mirror of
https://github.com/kubernetes-sigs/kustomize.git
synced 2026-06-13 10:00:56 +00:00
Merge pull request #2150 from haiyanmeng/stats
Add `fileType` and `User` into the index
This commit is contained in:
@@ -38,6 +38,8 @@ type Crawler interface {
|
||||
// Write to the document what the created time is.
|
||||
SetCreated(context.Context, *doc.Document) error
|
||||
|
||||
SetDefaultBranch(*doc.Document)
|
||||
|
||||
Match(*doc.Document) bool
|
||||
}
|
||||
|
||||
@@ -78,7 +80,9 @@ func findMatch(d *doc.Document, crawlers []Crawler) Crawler {
|
||||
func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
|
||||
seen utils.SeenMap, stack *CrawlSeed) {
|
||||
|
||||
seen.Add(cdoc.ID())
|
||||
seen.Set(cdoc.ID(), cdoc.GetDocument().FileType)
|
||||
|
||||
match.SetDefaultBranch(cdoc.GetDocument())
|
||||
|
||||
// Insert into index
|
||||
if err := indx(cdoc, index.InsertOrUpdate); err != nil {
|
||||
@@ -87,14 +91,14 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
|
||||
return
|
||||
}
|
||||
|
||||
deps, err := cdoc.GetResources(true, false, false)
|
||||
deps, err := cdoc.GetResources(true, true, true)
|
||||
if err != nil {
|
||||
logger.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
for _, dep := range deps {
|
||||
if seen.Seen(dep.ID()) {
|
||||
if seen.Seen(dep.ID()) && seen.Value(dep.ID()) == dep.FileType {
|
||||
continue
|
||||
}
|
||||
*stack = append(*stack, dep)
|
||||
@@ -102,7 +106,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
|
||||
}
|
||||
|
||||
func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc,
|
||||
seen utils.SeenMap, stack *CrawlSeed) {
|
||||
seen utils.SeenMap, stack *CrawlSeed, refreshDoc bool, updateFileType bool) {
|
||||
|
||||
UpdatedDocCount := 0
|
||||
seenDocCount := 0
|
||||
@@ -126,9 +130,11 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
|
||||
logger.Printf("Crawling doc %d: %s", crawledDocCount, tail.Path())
|
||||
|
||||
if seen.Seen(tail.ID()) {
|
||||
logger.Printf("this doc has been seen before")
|
||||
seenDocCount++
|
||||
continue
|
||||
if !updateFileType || seen.Value(tail.ID()) == tail.FileType {
|
||||
logger.Printf("this doc has been seen before")
|
||||
seenDocCount++
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if tail.WasCached() {
|
||||
@@ -144,6 +150,10 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
|
||||
continue
|
||||
}
|
||||
|
||||
if tail.User == "" {
|
||||
tail.User = doc.UserName(tail.RepositoryURL)
|
||||
}
|
||||
|
||||
// If the Document represents a kustomization root, FetchDcoument will change
|
||||
// the `filePath` field of the Document by adding `kustomization.yaml` or
|
||||
// `kustomization.yml` or `kustomization` into the the field.
|
||||
@@ -151,26 +161,34 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
|
||||
// calling FetchDocument. Otherwise, the binary may enter into an infinite loop
|
||||
// if a kustomization file points to its kustmozation root in its `resources` or
|
||||
// `bases` field.
|
||||
seen.Add(tail.ID())
|
||||
seen.Set(tail.ID(), tail.FileType)
|
||||
|
||||
if err := match.FetchDocument(ctx, tail); err != nil {
|
||||
logger.Printf("FetchDocument failed on doc(%s): %v", tail.Path(), err)
|
||||
FetchDocumentErrCount++
|
||||
// delete the document from the index
|
||||
cdoc := &doc.KustomizationDocument{
|
||||
Document: *tail,
|
||||
}
|
||||
seen.Add(cdoc.ID())
|
||||
if err := indx(cdoc, index.Delete); err != nil {
|
||||
logger.Printf("Failed to delete doc(%s): %v", cdoc.Path(), err)
|
||||
}
|
||||
deleteDocCount++
|
||||
continue
|
||||
if refreshDoc || tail.DefaultBranch == "" {
|
||||
match.SetDefaultBranch(tail)
|
||||
}
|
||||
|
||||
if err := match.SetCreated(ctx, tail); err != nil {
|
||||
logger.Printf("SetCreated failed on doc(%s): %v", tail.Path(), err)
|
||||
SetCreatedErrCount++
|
||||
if refreshDoc || tail.DocumentData == "" {
|
||||
if err := match.FetchDocument(ctx, tail); err != nil {
|
||||
logger.Printf("FetchDocument failed on doc(%s): %v", tail.Path(), err)
|
||||
FetchDocumentErrCount++
|
||||
// delete the document from the index
|
||||
cdoc := &doc.KustomizationDocument{
|
||||
Document: *tail,
|
||||
}
|
||||
seen.Set(cdoc.ID(), tail.FileType)
|
||||
if err := indx(cdoc, index.Delete); err != nil {
|
||||
logger.Printf("Failed to delete doc(%s): %v", cdoc.Path(), err)
|
||||
}
|
||||
deleteDocCount++
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if refreshDoc || tail.CreationTime == nil {
|
||||
if err := match.SetCreated(ctx, tail); err != nil {
|
||||
logger.Printf("SetCreated failed on doc(%s): %v", tail.Path(), err)
|
||||
SetCreatedErrCount++
|
||||
}
|
||||
}
|
||||
|
||||
cdoc, err := conv(tail)
|
||||
@@ -206,14 +224,14 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler,
|
||||
// Exploit seed to update bulk of corpus.
|
||||
logger.Printf("updating %d documents from seed\n", len(seed))
|
||||
// each unique document in seed will be crawled once.
|
||||
doCrawl(ctx, &seed, crawlers, conv, indx, seen, &stack)
|
||||
doCrawl(ctx, &seed, crawlers, conv, indx, seen, &stack, true, false)
|
||||
|
||||
// Traverse any new documents added while updating corpus.
|
||||
logger.Printf("crawling %d new documents found in the seed\n", len(stack))
|
||||
// While crawling each document in stack, the documents directly referred in the document
|
||||
// will be added into stack.
|
||||
// After this statement is done, stack will become empty.
|
||||
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack)
|
||||
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack, false, true)
|
||||
}
|
||||
|
||||
// CrawlGithubRunner is a blocking function and only returns once all of the
|
||||
@@ -294,6 +312,8 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
||||
for cdoc := range ch {
|
||||
docCount++
|
||||
logger.Printf("Processing doc %d found on Github", docCount)
|
||||
// all the docs here are kustomization files found by querying Github, and
|
||||
// their `FileType` fields all should be empty.
|
||||
if seen.Seen(cdoc.ID()) {
|
||||
logger.Printf("the doc has been seen before")
|
||||
continue
|
||||
@@ -320,5 +340,5 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
||||
// Handle deps of newly discovered documents.
|
||||
logger.Printf("crawling the %d new documents referred by other documents",
|
||||
len(stack))
|
||||
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack)
|
||||
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack, false, true)
|
||||
}
|
||||
|
||||
@@ -37,6 +37,8 @@ func (c testCrawler) Match(d *doc.Document) bool {
|
||||
return d != nil
|
||||
}
|
||||
|
||||
func (c testCrawler) SetDefaultBranch(d *doc.Document) {}
|
||||
|
||||
func (c testCrawler) FetchDocument(_ context.Context, d *doc.Document) error {
|
||||
if i, ok := c.lukp[d.ID()]; ok {
|
||||
d.DocumentData = c.docs[i].DocumentData
|
||||
|
||||
@@ -60,8 +60,16 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
|
||||
}
|
||||
}
|
||||
|
||||
func (gc githubCrawler) SetDefaultBranch(repo, branch string) {
|
||||
gc.branchMap[repo] = branch
|
||||
func (gc githubCrawler) SetDefaultBranch(d *doc.Document) {
|
||||
url := gc.client.ReposRequest(d.RepositoryFullName())
|
||||
defaultBranch, err := gc.client.GetDefaultBranch(url, d.RepositoryURL, gc.branchMap)
|
||||
if err != nil {
|
||||
logger.Printf(
|
||||
"(error: %v) setting default_branch to master\n", err)
|
||||
defaultBranch = "master"
|
||||
}
|
||||
d.DefaultBranch = defaultBranch
|
||||
gc.branchMap[d.RepositoryURL] = d.DefaultBranch
|
||||
}
|
||||
|
||||
func (gc githubCrawler) DefaultBranch(repo string) string {
|
||||
@@ -79,10 +87,20 @@ func (gc githubCrawler) Crawl(ctx context.Context,
|
||||
accessToken: gc.client.accessToken,
|
||||
}
|
||||
|
||||
var ranges []string
|
||||
var err error
|
||||
// Since Github returns a max of 1000 results per query, we can use
|
||||
// multiple queries that split the search space into chunks of at most
|
||||
// 1000 files to get all of the data.
|
||||
ranges, err := FindRangesForRepoSearch(newCache(noETagClient, gc.query))
|
||||
for i := 0; i < 5; i++ {
|
||||
ranges, err = FindRangesForRepoSearch(newCache(noETagClient, gc.query))
|
||||
if err == nil {
|
||||
logger.Printf("FindRangesForRepoSearch succeeded after %d retries", i)
|
||||
break
|
||||
} else {
|
||||
time.Sleep(time.Minute)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not split %v into ranges, %v\n",
|
||||
gc.query, err)
|
||||
@@ -114,19 +132,6 @@ func (gc githubCrawler) Crawl(ctx context.Context,
|
||||
// it will try to add each string in konfig.RecognizedKustomizationFileNames() to
|
||||
// d.FilePath, and try to fetch the document again.
|
||||
func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error {
|
||||
// set the default branch if it is empty
|
||||
if d.DefaultBranch == "" {
|
||||
url := gc.client.ReposRequest(d.RepositoryFullName())
|
||||
defaultBranch, err := gc.client.GetDefaultBranch(url, d.RepositoryURL, gc.branchMap)
|
||||
if err != nil {
|
||||
logger.Printf(
|
||||
"(error: %v) setting default_branch to master\n", err)
|
||||
defaultBranch = "master"
|
||||
}
|
||||
d.DefaultBranch = defaultBranch
|
||||
}
|
||||
gc.SetDefaultBranch(d.RepositoryURL, d.DefaultBranch)
|
||||
|
||||
repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch
|
||||
repoSpec, err := git.NewRepoSpecFromUrl(repoURL)
|
||||
if err != nil {
|
||||
@@ -283,10 +288,13 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap,
|
||||
defaultBranch = "master"
|
||||
}
|
||||
|
||||
// document here is a kustomization file found by querying Github, whose
|
||||
// `FileType` field should be empty.
|
||||
document := doc.Document{
|
||||
FilePath: k.Path,
|
||||
DefaultBranch: defaultBranch,
|
||||
RepositoryURL: k.Repository.URL,
|
||||
User: doc.UserName(k.Repository.URL),
|
||||
}
|
||||
|
||||
if seen.Seen(document.ID()) {
|
||||
@@ -304,6 +312,7 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap,
|
||||
FilePath: k.Path,
|
||||
DefaultBranch: defaultBranch,
|
||||
RepositoryURL: k.Repository.URL,
|
||||
User: doc.UserName(k.Repository.URL),
|
||||
},
|
||||
}
|
||||
creationTime, err := gcl.GetFileCreationTime(k)
|
||||
|
||||
Reference in New Issue
Block a user