From f9a4d5a14e2d345f5bee397fea45c1000f608617 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Fri, 10 Jan 2020 11:10:38 -0800 Subject: [PATCH 1/3] Track the crawling process --- api/internal/crawl/crawler/crawler.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 31cabc2b7..568dfacc0 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -105,6 +105,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C SetCreatedErrCount := 0 convErrCount := 0 deleteDocCount := 0 + crawledDocCount := 0 // During the execution of the for loop, more Documents may be added into (*docsPtr). for len(*docsPtr) > 0 { @@ -114,7 +115,11 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C // remove the last Document in (*docPtr) *docsPtr = (*docsPtr)[:(len(*docsPtr) - 1)] + crawledDocCount++ + logger.Printf("Crawling doc %d: %s %s", crawledDocCount, tail.RepositoryURL, tail.FilePath) + if _, ok := seen[tail.ID()]; ok { + logger.Printf("this doc has been seen before") seenDocCount++ continue } @@ -132,7 +137,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C continue } - logger.Println("Crawling ", tail.RepositoryURL, tail.FilePath) + if err := match.FetchDocument(ctx, tail); err != nil { logger.Printf("FetchDocument failed on %s %s: %v", tail.RepositoryURL, tail.FilePath, err) @@ -274,8 +279,12 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, wg.Add(1) go func() { defer wg.Done() + docCount := 0 for cdoc := range ch { + docCount++ + logger.Printf("Processing doc %d found on Github", docCount) if _, ok := seen[cdoc.ID()]; ok { + logger.Printf("the doc has been seen before") continue } match := findMatch(cdoc.GetDocument(), crawlers) From c801958d40746dd29ee5e2404a875424ed2cbd73 Mon Sep 17 00:00:00 2001 From: 
Haiyan Meng Date: Fri, 10 Jan 2020 11:37:22 -0800 Subject: [PATCH 2/3] Log response status code to help debug Recently, the crawler job often fails after 10+ hours with the following error (10.0.47.27:9200 is the ElasticSearch master): dial tcp 10.0.47.27:9200: connect: connection refused --- api/internal/crawl/index/elasticsearch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/internal/crawl/index/elasticsearch.go b/api/internal/crawl/index/elasticsearch.go index 3226b4970..7d7ce2b9e 100644 --- a/api/internal/crawl/index/elasticsearch.go +++ b/api/internal/crawl/index/elasticsearch.go @@ -87,7 +87,7 @@ func (idx *index) responseErrorOrNil(info string, res *esapi.Response, defer res.Body.Close() if res.IsError() { - return fmt.Errorf("%s: %s", messageStart, res.String()) + return fmt.Errorf("%s: %s [%d]", messageStart, res.String(), res.StatusCode) } if reader != nil { From 569fafba813cf14f1231401383df1d8dbb45b971 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Sat, 11 Jan 2020 15:32:25 -0800 Subject: [PATCH 3/3] Add the Document ID pointing to a kustomization root into cache to avoid crawling it repeatedly --- api/internal/crawl/crawler/crawler.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 568dfacc0..934a3e4ec 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -137,6 +137,14 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C continue } + // If the Document represents a kustomization root, FetchDocument will change + // the `filePath` field of the Document by adding `kustomization.yaml` or + // `kustomization.yml` or `kustomization` into the field. + // Therefore, it is necessary to add the ID of the Document into seen before + // calling FetchDocument. 
Otherwise, the binary may enter into an infinite loop + // if a kustomization file points to its kustomization root in its `resources` or + // `bases` field. + seen[tail.ID()] = struct{}{} if err := match.FetchDocument(ctx, tail); err != nil { logger.Printf("FetchDocument failed on %s %s: %v",