diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index 6831fed87..63df31409 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -121,7 +121,7 @@ func main() { // seen tracks the IDs of all the documents in the index. // This helps avoid indexing a given document multiple times. - seen := make(map[string]struct{}) + seen := crawler.NewSeenMap() var mode CrawlMode if len(os.Args) == 1 { diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index e1aec29b1..b8f9d3874 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -29,7 +29,7 @@ type Crawler interface { // Crawl returns when it is done processing. This method does not take // ownership of the channel. The channel is write only, and it // designates where the crawler should forward the documents. - Crawl(ctx context.Context, output chan<- CrawledDocument, seen map[string]struct{}) error + Crawl(ctx context.Context, output chan<- CrawledDocument, seen SeenMap) error // Get the document data given the FilePath, Repo, and Ref/Tag/Branch. FetchDocument(context.Context, *doc.Document) error @@ -47,6 +47,21 @@ type CrawledDocument interface { WasCached() bool } +type SeenMap map[string]struct{} + +func (seen SeenMap) Seen(item string) bool { + _, ok := seen[item] + return ok +} + +func (seen SeenMap) Add(item string) { + seen[item] = struct{}{} +} + +func NewSeenMap() SeenMap { + return make(map[string]struct{}) +} + type CrawlSeed []*doc.Document type IndexFunc func(CrawledDocument, index.Mode) error @@ -69,9 +84,9 @@ func findMatch(d *doc.Document, crawlers []Crawler) Crawler { } func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, - seen map[string]struct{}, stack *CrawlSeed) { + seen SeenMap, stack *CrawlSeed) { - seen[cdoc.ID()] = struct{}{} + seen.Add(cdoc.ID()) // Insert into index if err := indx(cdoc, index.InsertOrUpdate); err != nil { @@ -87,7 +102,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, } for _, dep := range deps { - if _, ok := seen[dep.ID()]; ok { + if seen.Seen(dep.ID()) { continue } *stack = append(*stack, dep) @@ -95,7 +110,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, } func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc, - seen map[string]struct{}, stack *CrawlSeed) { + seen SeenMap, stack *CrawlSeed) { UpdatedDocCount := 0 seenDocCount := 0 @@ -118,7 +133,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C crawledDocCount++ logger.Printf("Crawling doc %d: %s %s", crawledDocCount, tail.RepositoryURL, tail.FilePath) - if _, ok := seen[tail.ID()]; ok { + if seen.Seen(tail.ID()) { logger.Printf("this doc has been seen before") seenDocCount++ continue @@ -144,7 +159,8 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C // calling FetchDocument. Otherwise, the binary may enter into an infinite loop // if a kustomization file points to its kustmozation root in its `resources` or // `bases` field. - seen[tail.ID()] = struct{}{} + seen.Add(tail.ID()) + if err := match.FetchDocument(ctx, tail); err != nil { logger.Printf("FetchDocument failed on %s %s: %v", @@ -154,7 +170,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C cdoc := &doc.KustomizationDocument{ Document: *tail, } - seen[cdoc.ID()] = struct{}{} + seen.Add(cdoc.ID()) if err := indx(cdoc, index.Delete); err != nil { logger.Printf("Failed to delete %s %s: %v", cdoc.RepositoryURL, cdoc.FilePath, err) @@ -195,7 +211,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C // CrawlFromSeed updates all the documents in seed, and crawls all the new // documents referred in the seed. func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler, - conv Converter, indx IndexFunc, seen map[string]struct{}) { + conv Converter, indx IndexFunc, seen SeenMap) { // stack tracks the documents directly referred in other documents. stack := make(CrawlSeed, 0) @@ -231,7 +247,7 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler, // from the seed will be processed before any other documents from the // crawlers. func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument, - crawlers []Crawler, seen map[string]struct{}) []error { + crawlers []Crawler, seen SeenMap) []error { errs := make([]error, len(crawlers)) wg := sync.WaitGroup{} @@ -275,7 +291,7 @@ func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument, // CrawlGithub crawls all the kustomization files on Github. func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, - indx IndexFunc, seen map[string]struct{}) { + indx IndexFunc, seen SeenMap) { // stack tracks the documents directly referred in other documents. stack := make(CrawlSeed, 0) @@ -291,7 +307,7 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, for cdoc := range ch { docCount++ logger.Printf("Processing doc %d found on Github", docCount) - if _, ok := seen[cdoc.ID()]; ok { + if seen.Seen(cdoc.ID()) { logger.Printf("the doc has been seen before") continue } diff --git a/api/internal/crawl/crawler/crawler_test.go b/api/internal/crawl/crawler/crawler_test.go index dc4f0a067..383e834a5 100644 --- a/api/internal/crawl/crawler/crawler_test.go +++ b/api/internal/crawl/crawler/crawler_test.go @@ -75,7 +75,7 @@ func newCrawler(matchPrefix string, err error, // Crawl implements the Crawler interface for testing. func (c testCrawler) Crawl(_ context.Context, - output chan<- CrawledDocument, _ map[string]struct{}) error { + output chan<- CrawledDocument, _ SeenMap) error { for i, d := range c.docs { isResource := true @@ -181,7 +181,7 @@ func TestCrawlGithubRunner(t *testing.T) { defer close(output) defer wg.Done() - seen := map[string]struct{}{} + seen := NewSeenMap() errs := CrawlGithubRunner(context.Background(), output, test.tc, seen) @@ -323,7 +323,7 @@ resources: visited[d.ID()]++ return nil }, - make(map[string]struct{}), + NewSeenMap(), ) if lv, lc := len(visited), len(tc.corpus); lv != lc { t.Errorf("error: %d of %d documents visited.", lv, lc) diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index e1debd138..046ba4af0 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -68,7 +68,7 @@ func (gc githubCrawler) DefaultBranch(repo string) string { // Implements crawler.Crawler. func (gc githubCrawler) Crawl(ctx context.Context, - output chan<- crawler.CrawledDocument, seen map[string]struct{}) error { + output chan<- crawler.CrawledDocument, seen crawler.SeenMap) error { noETagClient := GhClient{ RequestConfig: gc.client.RequestConfig, @@ -217,7 +217,7 @@ func (r *RangeQueryResult) String() string { // processQuery follows all of the pages in a query, and updates/adds the // documents from the crawl to the datastore/index. func processQuery(ctx context.Context, gcl GhClient, query string, - output chan<- crawler.CrawledDocument, seen map[string]struct{}, + output chan<- crawler.CrawledDocument, seen crawler.SeenMap, branchMap map[string]string) (RangeQueryResult, error) { queryPages := make(chan GhResponseInfo) @@ -271,7 +271,7 @@ func processQuery(ctx context.Context, gcl GhClient, query string, return result, errs } -func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen map[string]struct{}, +func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen crawler.SeenMap, branchMap map[string]string) (crawler.CrawledDocument, error) { url := gcl.ReposRequest(k.Repository.FullName) defaultBranch, err := gcl.GetDefaultBranch(url, k.Repository.URL, branchMap) @@ -287,7 +287,7 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen map[string]stru RepositoryURL: k.Repository.URL, } - if _, ok := seen[document.ID()]; ok { + if seen.Seen(document.ID()) { return nil, nil }