From 81d62f90bfd722dba81c0cf7b6727dac81af5dd5 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Mon, 13 Jan 2020 14:56:47 -0800 Subject: [PATCH 1/4] Improve the efficiency of crawling github Make sure a github file is crawled once --- api/internal/crawl/crawler/crawler.go | 8 +- api/internal/crawl/crawler/crawler_test.go | 5 +- api/internal/crawl/crawler/github/crawler.go | 115 ++++++++++++++----- 3 files changed, 96 insertions(+), 32 deletions(-) diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 934a3e4ec..e1aec29b1 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -29,7 +29,7 @@ type Crawler interface { // Crawl returns when it is done processing. This method does not take // ownership of the channel. The channel is write only, and it // designates where the crawler should forward the documents. - Crawl(ctx context.Context, output chan<- CrawledDocument) error + Crawl(ctx context.Context, output chan<- CrawledDocument, seen map[string]struct{}) error // Get the document data given the FilePath, Repo, and Ref/Tag/Branch. FetchDocument(context.Context, *doc.Document) error @@ -231,7 +231,7 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler, // from the seed will be processed before any other documents from the // crawlers. 
func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument, - crawlers []Crawler) []error { + crawlers []Crawler, seen map[string]struct{}) []error { errs := make([]error, len(crawlers)) wg := sync.WaitGroup{} @@ -265,7 +265,7 @@ func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument, } }() defer close(docs) - errs[idx] = crawler.Crawl(ctx, docs) + errs[idx] = crawler.Crawl(ctx, docs, seen) }(i, crawler, docs) // Copies the index and the crawler } @@ -306,7 +306,7 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, }() logger.Println("processing the documents found from crawling github") - if errs := CrawlGithubRunner(ctx, ch, crawlers); errs != nil { + if errs := CrawlGithubRunner(ctx, ch, crawlers, seen); errs != nil { for _, err := range errs { logIfErr(err) } diff --git a/api/internal/crawl/crawler/crawler_test.go b/api/internal/crawl/crawler/crawler_test.go index 41a848612..dc4f0a067 100644 --- a/api/internal/crawl/crawler/crawler_test.go +++ b/api/internal/crawl/crawler/crawler_test.go @@ -75,7 +75,7 @@ func newCrawler(matchPrefix string, err error, // Crawl implements the Crawler interface for testing. func (c testCrawler) Crawl(_ context.Context, - output chan<- CrawledDocument) error { + output chan<- CrawledDocument, _ map[string]struct{}) error { for i, d := range c.docs { isResource := true @@ -181,8 +181,9 @@ func TestCrawlGithubRunner(t *testing.T) { defer close(output) defer wg.Done() + seen := map[string]struct{}{} errs := CrawlGithubRunner(context.Background(), - output, test.tc) + output, test.tc, seen) // Check that errors are returned as they should be. 
if !reflect.DeepEqual(errs, test.errs) { diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 0ad438169..15ff0d73b 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -30,6 +30,8 @@ var logger = log.New(os.Stdout, "Github Crawler: ", type githubCrawler struct { client GhClient query Query + // branchMap maps github repositories to their default branches + branchMap map[string]string } type GhClient struct { @@ -51,13 +53,22 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client, }, accessToken: accessToken, }, - query: query, + query: query, + branchMap: map[string]string{}, } } +func (gc githubCrawler) SetDefaultBranch(repo, branch string) { + gc.branchMap[repo] = branch +} + +func (gc githubCrawler) DefaultBranch(repo string) string { + return gc.branchMap[repo] +} + // Implements crawler.Crawler. -func (gc githubCrawler) Crawl( - ctx context.Context, output chan<- crawler.CrawledDocument) error { +func (gc githubCrawler) Crawl(ctx context.Context, + output chan<- crawler.CrawledDocument, seen map[string]struct{}) error { noETagClient := GhClient{ RequestConfig: gc.client.RequestConfig, @@ -79,17 +90,26 @@ func (gc githubCrawler) Crawl( // Query each range for files. errs := make(multiError, 0) + queryResult := RangeQueryResult{} for _, query := range ranges { - err := processQuery(ctx, gc.client, query, output) + result, err := processQuery(ctx, gc.client, query, output, seen, gc.branchMap) if err != nil { errs = append(errs, err) } + queryResult.totalDocCnt += result.totalDocCnt + queryResult.seenDocCnt += result.seenDocCnt + queryResult.newDocCnt += result.newDocCnt + queryResult.errorCnt += result.errorCnt } if len(errs) > 0 { return errs } - + logger.Printf("Summary of Crawl: got %d files from Github. "+ + "%d have been seen before. %d are new and sent to the output channel." 
+ + "%d have kustomizationResultAdapter errors.", + queryResult.totalDocCnt, queryResult.seenDocCnt, + queryResult.newDocCnt, queryResult.errorCnt) return nil } @@ -100,7 +120,7 @@ func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error // set the default branch if it is empty if d.DefaultBranch == "" { url := gc.client.ReposRequest(d.RepositoryFullName()) - defaultBranch, err := gc.client.GetDefaultBranch(url) + defaultBranch, err := gc.client.GetDefaultBranch(url, d.RepositoryURL, gc.branchMap) if err != nil { logger.Printf( "(error: %v) setting default_branch to master\n", err) @@ -108,6 +128,8 @@ func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error } d.DefaultBranch = defaultBranch } + gc.SetDefaultBranch(d.RepositoryURL, d.DefaultBranch) + repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch repoSpec, err := git.NewRepoSpecFromUrl(repoURL) if err != nil { @@ -176,10 +198,18 @@ func (gc githubCrawler) Match(d *doc.Document) bool { return strings.Contains(repoSpec.Host, "github.com") } +type RangeQueryResult struct { + totalDocCnt uint64 + seenDocCnt uint64 + newDocCnt uint64 + errorCnt uint64 +} + // processQuery follows all of the pages in a query, and updates/adds the // documents from the crawl to the datastore/index. 
func processQuery(ctx context.Context, gcl GhClient, query string, - output chan<- crawler.CrawledDocument) error { + output chan<- crawler.CrawledDocument, seen map[string]struct{}, + branchMap map[string]string) (RangeQueryResult, error) { queryPages := make(chan GhResponseInfo) @@ -196,50 +226,75 @@ func processQuery(ctx context.Context, gcl GhClient, query string, }() errs := make(multiError, 0) - errorCnt := 0 - totalCnt := 0 + result := RangeQueryResult{} + pageID := 1 for page := range queryPages { if page.Error != nil { errs = append(errs, page.Error) continue } - + var errorCnt, seenDocCnt, newDocCnt, totalDocCnt uint64 for _, file := range page.Parsed.Items { - k, err := kustomizationResultAdapter(gcl, file) + k, err := kustomizationResultAdapter(gcl, file, seen, branchMap) if err != nil { logger.Printf("kustomizationResultAdapter failed: %v", err) errs = append(errs, err) errorCnt++ } if k != nil { + newDocCnt++ output <- k + } else { + seenDocCnt++ } - totalCnt++ + totalDocCnt++ } - logger.Printf("got %d files out of %d from API. %d of %d had errors\n", - totalCnt, page.Parsed.TotalCount, errorCnt, totalCnt) + logger.Printf("processQuery [page %d]: got %d files out of %d from API. "+ + "%d have been seen before. %d are new and sent to the output channel." + + "%d have kustomizationResultAdapter errors.", + pageID, totalDocCnt, page.Parsed.TotalCount, seenDocCnt, newDocCnt, errorCnt) + result.totalDocCnt += totalDocCnt + result.seenDocCnt += seenDocCnt + result.newDocCnt += newDocCnt + result.errorCnt += errorCnt + + pageID++ } - return errs + logger.Printf("Summary of processQuery: got %d files from API. "+ + "%d have been seen before. %d are new and sent to the output channel." 
+ + " %d have kustomizationResultAdapter errors.", + result.totalDocCnt, result.seenDocCnt, result.newDocCnt, result.errorCnt) + + return result, errs } -func kustomizationResultAdapter(gcl GhClient, k GhFileSpec) ( - crawler.CrawledDocument, error) { - - data, err := gcl.GetFileData(k) - if err != nil { - return nil, err - } - +func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen map[string]struct{}, + branchMap map[string]string) (crawler.CrawledDocument, error) { url := gcl.ReposRequest(k.Repository.FullName) - defaultBranch, err := gcl.GetDefaultBranch(url) + defaultBranch, err := gcl.GetDefaultBranch(url, k.Repository.URL, branchMap) if err != nil { logger.Printf( "(error: %v) setting default_branch to master\n", err) defaultBranch = "master" } + document := doc.Document{ + FilePath: k.Path, + DefaultBranch: defaultBranch, + RepositoryURL: k.Repository.URL, + } + + if _, ok := seen[document.ID()]; ok { + return nil, nil + } + + data, err := gcl.GetFileData(k) + if err != nil { + return nil, err + } + d := doc.KustomizationDocument{ Document: doc.Document{ DocumentData: string(data), @@ -344,7 +399,15 @@ func CloseResponseBody(resp *http.Response) { } } -func (gcl GhClient) GetDefaultBranch(url string) (string, error) { +// GetDefaultBranch gets the default branch of a github repository. +// m is a map which maps a github repository to its default branch. +// If repo is already in m, the default branch for url will be obtained from m; +// otherwise, a query will be made to github to obtain the default branch. +func (gcl GhClient) GetDefaultBranch(url, repo string, m map[string]string) (string, error) { + if v, ok := m[repo]; ok { + return v, nil + } + resp, err := gcl.GetReposData(url) if err != nil { return "", fmt.Errorf( @@ -589,7 +652,7 @@ func (gcl GhClient) Do(query string) (*http.Response, error) { // gcl.client.Do: a non-2xx status code doesn't cause an error. // See https://golang.org/pkg/net/http/#Client.Do for more info. 
- resp, err := gcl.client.Do(req) + resp, err := gcl.client.Do(req) if resp != nil && resp.StatusCode != http.StatusOK { err = fmt.Errorf("GhClient.Do(%s) failed with response code: %d", query, resp.StatusCode) From 14eb524b9e1958dd7bd55b66116e54805322fd6d Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Mon, 13 Jan 2020 15:55:05 -0800 Subject: [PATCH 2/4] Add a command for searching for kustomize resource files --- api/internal/crawl/crawler/github/crawler.go | 8 +++++--- api/internal/crawl/search_cmds/keyword_search.md | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 15ff0d73b..441bdc0d8 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -102,14 +102,16 @@ func (gc githubCrawler) Crawl(ctx context.Context, queryResult.errorCnt += result.errorCnt } - if len(errs) > 0 { - return errs - } logger.Printf("Summary of Crawl: got %d files from Github. "+ "%d have been seen before. %d are new and sent to the output channel." 
+ "%d have kustomizationResultAdapter errors.", queryResult.totalDocCnt, queryResult.seenDocCnt, queryResult.newDocCnt, queryResult.errorCnt) + + if len(errs) > 0 { + return errs + } + return nil } diff --git a/api/internal/crawl/search_cmds/keyword_search.md b/api/internal/crawl/search_cmds/keyword_search.md index db703e91a..e3c152d00 100644 --- a/api/internal/crawl/search_cmds/keyword_search.md +++ b/api/internal/crawl/search_cmds/keyword_search.md @@ -63,4 +63,20 @@ curl -X GET "${ElasticSearchURL}:9200/kustomize/_search?pretty" -H 'Content-Type } } ' +``` + +Search all the documents whose filePath does not end with any of these following +three filenames: `kustomization.yaml`, `kustomization.yml`, `kustomization`: +``` +curl -X GET "${ElasticSearchURL}:9200/kustomize/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": [ + { "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }} + ] + } + } +} +' ``` \ No newline at end of file From 230e0ca75230125feec1b7cf0e7bdd352851581a Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 14 Jan 2020 10:41:16 -0800 Subject: [PATCH 3/4] Add two methods to type RangeQueryResult: Add and String --- api/internal/crawl/crawler/github/crawler.go | 53 ++++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 441bdc0d8..e1debd138 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -92,21 +92,14 @@ func (gc githubCrawler) Crawl(ctx context.Context, errs := make(multiError, 0) queryResult := RangeQueryResult{} for _, query := range ranges { - result, err := processQuery(ctx, gc.client, query, output, seen, gc.branchMap) + rangeResult, err := processQuery(ctx, gc.client, query, output, seen, gc.branchMap) if err != nil { errs = append(errs, err) } - queryResult.totalDocCnt += result.totalDocCnt - 
queryResult.seenDocCnt += result.seenDocCnt - queryResult.newDocCnt += result.newDocCnt - queryResult.errorCnt += result.errorCnt + queryResult.Add(rangeResult) } - logger.Printf("Summary of Crawl: got %d files from Github. "+ - "%d have been seen before. %d are new and sent to the output channel." + - "%d have kustomizationResultAdapter errors.", - queryResult.totalDocCnt, queryResult.seenDocCnt, - queryResult.newDocCnt, queryResult.errorCnt) + logger.Printf("Summary of Crawl: %s", queryResult.String()) if len(errs) > 0 { return errs @@ -207,6 +200,20 @@ type RangeQueryResult struct { errorCnt uint64 } +func (r *RangeQueryResult) Add(other RangeQueryResult) { + r.totalDocCnt += other.totalDocCnt + r.newDocCnt += other.newDocCnt + r.seenDocCnt += other.seenDocCnt + r.errorCnt += other.errorCnt +} + +func (r *RangeQueryResult) String() string { + return fmt.Sprintf("got %d files from API. "+ + "%d have been seen before. %d are new and sent to the output channel." + + " %d have kustomizationResultAdapter errors.", + r.totalDocCnt, r.seenDocCnt, r.newDocCnt, r.errorCnt) +} + // processQuery follows all of the pages in a query, and updates/adds the // documents from the crawl to the datastore/index. func processQuery(ctx context.Context, gcl GhClient, query string, @@ -235,39 +242,31 @@ func processQuery(ctx context.Context, gcl GhClient, query string, errs = append(errs, page.Error) continue } - var errorCnt, seenDocCnt, newDocCnt, totalDocCnt uint64 + pageResult := RangeQueryResult{} for _, file := range page.Parsed.Items { k, err := kustomizationResultAdapter(gcl, file, seen, branchMap) if err != nil { logger.Printf("kustomizationResultAdapter failed: %v", err) errs = append(errs, err) - errorCnt++ + pageResult.errorCnt++ } if k != nil { - newDocCnt++ + pageResult.newDocCnt++ output <- k } else { - seenDocCnt++ + pageResult.seenDocCnt++ } - totalDocCnt++ + pageResult.totalDocCnt++ } - logger.Printf("processQuery [page %d]: got %d files out of %d from API. 
"+ - "%d have been seen before. %d are new and sent to the output channel." + - "%d have kustomizationResultAdapter errors.", - pageID, totalDocCnt, page.Parsed.TotalCount, seenDocCnt, newDocCnt, errorCnt) - result.totalDocCnt += totalDocCnt - result.seenDocCnt += seenDocCnt - result.newDocCnt += newDocCnt - result.errorCnt += errorCnt + logger.Printf("processQuery [TotalCount %d - page %d]: %s", + page.Parsed.TotalCount, pageID, pageResult.String()) + result.Add(pageResult) pageID++ } - logger.Printf("Summary of processQuery: got %d files from API. "+ - "%d have been seen before. %d are new and sent to the output channel." + - " %d have kustomizationResultAdapter errors.", - result.totalDocCnt, result.seenDocCnt, result.newDocCnt, result.errorCnt) + logger.Printf("Summary of processQuery: %s", result.String()) return result, errs } From 72eda992bd25389d9fa9fe5f070527be12616c30 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 14 Jan 2020 11:03:29 -0800 Subject: [PATCH 4/4] make `seen` a non-primitive type --- api/internal/crawl/cmd/crawler/crawler.go | 2 +- api/internal/crawl/crawler/crawler.go | 40 ++++++++++++++------ api/internal/crawl/crawler/crawler_test.go | 6 +-- api/internal/crawl/crawler/github/crawler.go | 8 ++-- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index 6831fed87..63df31409 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -121,7 +121,7 @@ func main() { // seen tracks the IDs of all the documents in the index. // This helps avoid indexing a given document multiple times. 
- seen := make(map[string]struct{}) + seen := crawler.NewSeenMap() var mode CrawlMode if len(os.Args) == 1 { diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index e1aec29b1..b8f9d3874 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -29,7 +29,7 @@ type Crawler interface { // Crawl returns when it is done processing. This method does not take // ownership of the channel. The channel is write only, and it // designates where the crawler should forward the documents. - Crawl(ctx context.Context, output chan<- CrawledDocument, seen map[string]struct{}) error + Crawl(ctx context.Context, output chan<- CrawledDocument, seen SeenMap) error // Get the document data given the FilePath, Repo, and Ref/Tag/Branch. FetchDocument(context.Context, *doc.Document) error @@ -47,6 +47,21 @@ type CrawledDocument interface { WasCached() bool } +type SeenMap map[string]struct{} + +func (seen SeenMap) Seen(item string) bool { + _, ok := seen[item] + return ok +} + +func (seen SeenMap) Add(item string) { + seen[item] = struct{}{} +} + +func NewSeenMap() SeenMap { + return make(map[string]struct{}) +} + type CrawlSeed []*doc.Document type IndexFunc func(CrawledDocument, index.Mode) error @@ -69,9 +84,9 @@ func findMatch(d *doc.Document, crawlers []Crawler) Crawler { } func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, - seen map[string]struct{}, stack *CrawlSeed) { + seen SeenMap, stack *CrawlSeed) { - seen[cdoc.ID()] = struct{}{} + seen.Add(cdoc.ID()) // Insert into index if err := indx(cdoc, index.InsertOrUpdate); err != nil { @@ -87,7 +102,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, } for _, dep := range deps { - if _, ok := seen[dep.ID()]; ok { + if seen.Seen(dep.ID()) { continue } *stack = append(*stack, dep) @@ -95,7 +110,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, } func doCrawl(ctx context.Context, docsPtr 
*CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc, - seen map[string]struct{}, stack *CrawlSeed) { + seen SeenMap, stack *CrawlSeed) { UpdatedDocCount := 0 seenDocCount := 0 @@ -118,7 +133,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C crawledDocCount++ logger.Printf("Crawling doc %d: %s %s", crawledDocCount, tail.RepositoryURL, tail.FilePath) - if _, ok := seen[tail.ID()]; ok { + if seen.Seen(tail.ID()) { logger.Printf("this doc has been seen before") seenDocCount++ continue @@ -144,7 +159,8 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C // calling FetchDocument. Otherwise, the binary may enter into an infinite loop // if a kustomization file points to its kustmozation root in its `resources` or // `bases` field. - seen[tail.ID()] = struct{}{} + seen.Add(tail.ID()) + if err := match.FetchDocument(ctx, tail); err != nil { logger.Printf("FetchDocument failed on %s %s: %v", @@ -154,7 +170,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C cdoc := &doc.KustomizationDocument{ Document: *tail, } - seen[cdoc.ID()] = struct{}{} + seen.Add(cdoc.ID()) if err := indx(cdoc, index.Delete); err != nil { logger.Printf("Failed to delete %s %s: %v", cdoc.RepositoryURL, cdoc.FilePath, err) @@ -195,7 +211,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C // CrawlFromSeed updates all the documents in seed, and crawls all the new // documents referred in the seed. func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler, - conv Converter, indx IndexFunc, seen map[string]struct{}) { + conv Converter, indx IndexFunc, seen SeenMap) { // stack tracks the documents directly referred in other documents. stack := make(CrawlSeed, 0) @@ -231,7 +247,7 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler, // from the seed will be processed before any other documents from the // crawlers. 
func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument, - crawlers []Crawler, seen map[string]struct{}) []error { + crawlers []Crawler, seen SeenMap) []error { errs := make([]error, len(crawlers)) wg := sync.WaitGroup{} @@ -275,7 +291,7 @@ func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument, // CrawlGithub crawls all the kustomization files on Github. func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, - indx IndexFunc, seen map[string]struct{}) { + indx IndexFunc, seen SeenMap) { // stack tracks the documents directly referred in other documents. stack := make(CrawlSeed, 0) @@ -291,7 +307,7 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, for cdoc := range ch { docCount++ logger.Printf("Processing doc %d found on Github", docCount) - if _, ok := seen[cdoc.ID()]; ok { + if seen.Seen(cdoc.ID()) { logger.Printf("the doc has been seen before") continue } diff --git a/api/internal/crawl/crawler/crawler_test.go b/api/internal/crawl/crawler/crawler_test.go index dc4f0a067..383e834a5 100644 --- a/api/internal/crawl/crawler/crawler_test.go +++ b/api/internal/crawl/crawler/crawler_test.go @@ -75,7 +75,7 @@ func newCrawler(matchPrefix string, err error, // Crawl implements the Crawler interface for testing. 
func (c testCrawler) Crawl(_ context.Context, - output chan<- CrawledDocument, _ map[string]struct{}) error { + output chan<- CrawledDocument, _ SeenMap) error { for i, d := range c.docs { isResource := true @@ -181,7 +181,7 @@ func TestCrawlGithubRunner(t *testing.T) { defer close(output) defer wg.Done() - seen := map[string]struct{}{} + seen := NewSeenMap() errs := CrawlGithubRunner(context.Background(), output, test.tc, seen) @@ -323,7 +323,7 @@ resources: visited[d.ID()]++ return nil }, - make(map[string]struct{}), + NewSeenMap(), ) if lv, lc := len(visited), len(tc.corpus); lv != lc { t.Errorf("error: %d of %d documents visited.", lv, lc) diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index e1debd138..046ba4af0 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -68,7 +68,7 @@ func (gc githubCrawler) DefaultBranch(repo string) string { // Implements crawler.Crawler. func (gc githubCrawler) Crawl(ctx context.Context, - output chan<- crawler.CrawledDocument, seen map[string]struct{}) error { + output chan<- crawler.CrawledDocument, seen crawler.SeenMap) error { noETagClient := GhClient{ RequestConfig: gc.client.RequestConfig, @@ -217,7 +217,7 @@ func (r *RangeQueryResult) String() string { // processQuery follows all of the pages in a query, and updates/adds the // documents from the crawl to the datastore/index. 
func processQuery(ctx context.Context, gcl GhClient, query string, - output chan<- crawler.CrawledDocument, seen map[string]struct{}, + output chan<- crawler.CrawledDocument, seen crawler.SeenMap, branchMap map[string]string) (RangeQueryResult, error) { queryPages := make(chan GhResponseInfo) @@ -271,7 +271,7 @@ func processQuery(ctx context.Context, gcl GhClient, query string, return result, errs } -func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen map[string]struct{}, +func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen crawler.SeenMap, branchMap map[string]string) (crawler.CrawledDocument, error) { url := gcl.ReposRequest(k.Repository.FullName) defaultBranch, err := gcl.GetDefaultBranch(url, k.Repository.URL, branchMap) @@ -287,7 +287,7 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen map[string]stru RepositoryURL: k.Repository.URL, } - if _, ok := seen[document.ID()]; ok { + if seen.Seen(document.ID()) { return nil, nil }