Merge pull request #2090 from haiyanmeng/retry

Add the Document ID pointing to a kuostomization root into cache to avoid crawling it repeatedly
This commit is contained in:
Jeff Regan
2020-01-13 10:53:01 -08:00
committed by GitHub
2 changed files with 19 additions and 2 deletions

View File

@@ -105,6 +105,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
SetCreatedErrCount := 0
convErrCount := 0
deleteDocCount := 0
crawledDocCount := 0
// During the execution of the for loop, more Documents may be added into (*docsPtr).
for len(*docsPtr) > 0 {
@@ -114,7 +115,11 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
// remove the last Document in (*docPtr)
*docsPtr = (*docsPtr)[:(len(*docsPtr) - 1)]
crawledDocCount++
logger.Printf("Crawling doc %d: %s %s", crawledDocCount, tail.RepositoryURL, tail.FilePath)
if _, ok := seen[tail.ID()]; ok {
logger.Printf("this doc has been seen before")
seenDocCount++
continue
}
@@ -132,7 +137,15 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
continue
}
logger.Println("Crawling ", tail.RepositoryURL, tail.FilePath)
// If the Document represents a kustomization root, FetchDcoument will change
// the `filePath` field of the Document by adding `kustomization.yaml` or
// `kustomization.yml` or `kustomization` into the the field.
// Therefore, it is necessary to add the ID of the Document into seen before
// calling FetchDocument. Otherwise, the binary may enter into an infinite loop
// if a kustomization file points to its kustmozation root in its `resources` or
// `bases` field.
seen[tail.ID()] = struct{}{}
if err := match.FetchDocument(ctx, tail); err != nil {
logger.Printf("FetchDocument failed on %s %s: %v",
tail.RepositoryURL, tail.FilePath, err)
@@ -274,8 +287,12 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
wg.Add(1)
go func() {
defer wg.Done()
docCount := 0
for cdoc := range ch {
docCount++
logger.Printf("Processing doc %d found on Github", docCount)
if _, ok := seen[cdoc.ID()]; ok {
logger.Printf("the doc has been seen before")
continue
}
match := findMatch(cdoc.GetDocument(), crawlers)

View File

@@ -87,7 +87,7 @@ func (idx *index) responseErrorOrNil(info string, res *esapi.Response,
defer res.Body.Close()
if res.IsError() {
return fmt.Errorf("%s: %s", messageStart, res.String())
return fmt.Errorf("%s: %s [%d]", messageStart, res.String(), res.StatusCode)
}
if reader != nil {