diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 31cabc2b7..934a3e4ec 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -105,6 +105,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C SetCreatedErrCount := 0 convErrCount := 0 deleteDocCount := 0 + crawledDocCount := 0 // During the execution of the for loop, more Documents may be added into (*docsPtr). for len(*docsPtr) > 0 { @@ -114,7 +115,11 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C // remove the last Document in (*docPtr) *docsPtr = (*docsPtr)[:(len(*docsPtr) - 1)] + crawledDocCount++ + logger.Printf("Crawling doc %d: %s %s", crawledDocCount, tail.RepositoryURL, tail.FilePath) + if _, ok := seen[tail.ID()]; ok { + logger.Printf("this doc has been seen before") seenDocCount++ continue } @@ -132,7 +137,15 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C continue } - logger.Println("Crawling ", tail.RepositoryURL, tail.FilePath) + // If the Document represents a kustomization root, FetchDocument will change + // the `filePath` field of the Document by adding `kustomization.yaml` or + // `kustomization.yml` or `kustomization` into the field. + // Therefore, it is necessary to add the ID of the Document into seen before + // calling FetchDocument. Otherwise, the binary may enter into an infinite loop + // if a kustomization file points to its kustomization root in its `resources` or + // `bases` field. 
+ seen[tail.ID()] = struct{}{} + if err := match.FetchDocument(ctx, tail); err != nil { logger.Printf("FetchDocument failed on %s %s: %v", tail.RepositoryURL, tail.FilePath, err) @@ -274,8 +287,12 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, wg.Add(1) go func() { defer wg.Done() + docCount := 0 for cdoc := range ch { + docCount++ + logger.Printf("Processing doc %d found on Github", docCount) if _, ok := seen[cdoc.ID()]; ok { + logger.Printf("the doc has been seen before") continue } match := findMatch(cdoc.GetDocument(), crawlers) diff --git a/api/internal/crawl/index/elasticsearch.go b/api/internal/crawl/index/elasticsearch.go index 3226b4970..7d7ce2b9e 100644 --- a/api/internal/crawl/index/elasticsearch.go +++ b/api/internal/crawl/index/elasticsearch.go @@ -87,7 +87,7 @@ func (idx *index) responseErrorOrNil(info string, res *esapi.Response, defer res.Body.Close() if res.IsError() { - return fmt.Errorf("%s: %s", messageStart, res.String()) + return fmt.Errorf("%s: %s [%d]", messageStart, res.String(), res.StatusCode) } if reader != nil {