diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index 4a7883e54..2d301efe1 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -9,6 +9,8 @@ import ( "os" "time" + "sigs.k8s.io/kustomize/api/internal/crawl/utils" + "sigs.k8s.io/kustomize/api/internal/crawl/crawler" "sigs.k8s.io/kustomize/api/internal/crawl/crawler/github" "sigs.k8s.io/kustomize/api/internal/crawl/doc" @@ -26,6 +28,7 @@ const ( ) type CrawlMode int + const ( CrawlUnknown CrawlMode = iota // Crawl all the kustomization files in all the repositories of a Github user @@ -125,13 +128,13 @@ func main() { // seen tracks the IDs of all the documents in the index. // This helps avoid indexing a given document multiple times. - seen := crawler.NewSeenMap() + seen := utils.NewSeenMap() mode := NewCrawlMode(*modePtr) ghCrawlerConstructor := func(user, repo string) crawler.Crawler { if user != "" { - return github.NewCrawler(githubToken, retryCount, clientCache, + return github.NewCrawler(githubToken, retryCount, clientCache, github.QueryWith( github.Filename("kustomization.yaml"), github.Filename("kustomization.yml"), diff --git a/api/internal/crawl/cmd/kustomize_stats/main.go b/api/internal/crawl/cmd/kustomize_stats/main.go index 234d1327c..d3a436ebe 100644 --- a/api/internal/crawl/cmd/kustomize_stats/main.go +++ b/api/internal/crawl/cmd/kustomize_stats/main.go @@ -7,7 +7,6 @@ import ( "log" "net/http" "os" - "sigs.k8s.io/kustomize/api/internal/crawl/crawler" "sort" "time" @@ -51,10 +50,10 @@ func GeneratorAndTransformerStats(ctx context.Context, generatorDocs []*doc.Document, transformerDocs []*doc.Document, idx *index.KustomizeIndex) { // allGenerators includes all the documents referred in the generators field - allGenerators := crawler.NewUniqueDocuments() + allGenerators := doc.NewUniqueDocuments() // allTransformers includes all the documents referred in the transformers field - allTransformers := 
crawler.NewUniqueDocuments() + allTransformers := doc.NewUniqueDocuments() // docUsingGeneratorCount counts the number of the kustomization files using generators docUsingGeneratorCount := 0 diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 2f7026e7a..11715616d 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -10,6 +10,8 @@ import ( "os" "sync" + "sigs.k8s.io/kustomize/api/internal/crawl/utils" + "sigs.k8s.io/kustomize/api/internal/crawl/index" _ "github.com/gomodule/redigo/redis" @@ -29,7 +31,7 @@ type Crawler interface { // Crawl returns when it is done processing. This method does not take // ownership of the channel. The channel is write only, and it // designates where the crawler should forward the documents. - Crawl(ctx context.Context, output chan<- CrawledDocument, seen SeenMap) error + Crawl(ctx context.Context, output chan<- CrawledDocument, seen utils.SeenMap) error // Get the document data given the FilePath, Repo, and Ref/Tag/Branch. 
FetchDocument(context.Context, *doc.Document) error @@ -52,21 +54,6 @@ type CrawledDocument interface { WasCached() bool } -type SeenMap map[string]struct{} - -func (seen SeenMap) Seen(item string) bool { - _, ok := seen[item] - return ok -} - -func (seen SeenMap) Add(item string) { - seen[item] = struct{}{} -} - -func NewSeenMap() SeenMap { - return make(map[string]struct{}) -} - type CrawlSeed []*doc.Document type IndexFunc func(CrawledDocument, index.Mode) error @@ -89,7 +76,7 @@ func findMatch(d *doc.Document, crawlers []Crawler) Crawler { } func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, - seen SeenMap, stack *CrawlSeed) { + seen utils.SeenMap, stack *CrawlSeed) { seen.Add(cdoc.ID()) @@ -115,7 +102,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, } func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc, - seen SeenMap, stack *CrawlSeed) { + seen utils.SeenMap, stack *CrawlSeed) { UpdatedDocCount := 0 seenDocCount := 0 @@ -166,7 +153,6 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C // `bases` field. seen.Add(tail.ID()) - if err := match.FetchDocument(ctx, tail); err != nil { logger.Printf("FetchDocument failed on doc(%s): %v", tail.Path(), err) FetchDocumentErrCount++ @@ -212,7 +198,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C // CrawlFromSeed updates all the documents in seed, and crawls all the new // documents referred in the seed. func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler, - conv Converter, indx IndexFunc, seen SeenMap) { + conv Converter, indx IndexFunc, seen utils.SeenMap) { // stack tracks the documents directly referred in other documents. stack := make(CrawlSeed, 0) @@ -248,7 +234,7 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler, // from the seed will be processed before any other documents from the // crawlers. 
func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument, - crawlers []Crawler, seen SeenMap) []error { + crawlers []Crawler, seen utils.SeenMap) []error { errs := make([]error, len(crawlers)) wg := sync.WaitGroup{} @@ -292,7 +278,7 @@ func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument, // CrawlGithub crawls all the kustomization files on Github. func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, - indx IndexFunc, seen SeenMap) { + indx IndexFunc, seen utils.SeenMap) { // stack tracks the documents directly referred in other documents. stack := make(CrawlSeed, 0) diff --git a/api/internal/crawl/crawler/crawler_test.go b/api/internal/crawl/crawler/crawler_test.go index 7dace4da6..d18a4afce 100644 --- a/api/internal/crawl/crawler/crawler_test.go +++ b/api/internal/crawl/crawler/crawler_test.go @@ -12,6 +12,8 @@ import ( "testing" "time" + "sigs.k8s.io/kustomize/api/internal/crawl/utils" + "sigs.k8s.io/kustomize/api/internal/crawl/index" "sigs.k8s.io/kustomize/api/internal/crawl/doc" @@ -76,7 +78,7 @@ func newCrawler(matchPrefix string, err error, // Crawl implements the Crawler interface for testing. 
func (c testCrawler) Crawl(_ context.Context, - output chan<- CrawledDocument, _ SeenMap) error { + output chan<- CrawledDocument, _ utils.SeenMap) error { for i, d := range c.docs { isResource := true @@ -182,7 +184,7 @@ func TestCrawlGithubRunner(t *testing.T) { defer close(output) defer wg.Done() - seen := NewSeenMap() + seen := utils.NewSeenMap() errs := CrawlGithubRunner(context.Background(), output, test.tc, seen) @@ -324,7 +326,7 @@ resources: visited[d.ID()]++ return nil }, - NewSeenMap(), + utils.NewSeenMap(), ) if lv, lc := len(visited), len(tc.corpus); lv != lc { t.Errorf("error: %d of %d documents visited.", lv, lc) diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 046ba4af0..e1aeff401 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -16,6 +16,8 @@ import ( "strings" "time" + "sigs.k8s.io/kustomize/api/internal/crawl/utils" + "sigs.k8s.io/kustomize/api/internal/crawl/crawler" "sigs.k8s.io/kustomize/api/internal/crawl/doc" "sigs.k8s.io/kustomize/api/internal/crawl/httpclient" @@ -68,7 +70,7 @@ func (gc githubCrawler) DefaultBranch(repo string) string { // Implements crawler.Crawler. func (gc githubCrawler) Crawl(ctx context.Context, - output chan<- crawler.CrawledDocument, seen crawler.SeenMap) error { + output chan<- crawler.CrawledDocument, seen utils.SeenMap) error { noETagClient := GhClient{ RequestConfig: gc.client.RequestConfig, @@ -195,9 +197,9 @@ func (gc githubCrawler) Match(d *doc.Document) bool { type RangeQueryResult struct { totalDocCnt uint64 - seenDocCnt uint64 - newDocCnt uint64 - errorCnt uint64 + seenDocCnt uint64 + newDocCnt uint64 + errorCnt uint64 } func (r *RangeQueryResult) Add(other RangeQueryResult) { @@ -209,7 +211,7 @@ func (r *RangeQueryResult) Add(other RangeQueryResult) { func (r *RangeQueryResult) String() string { return fmt.Sprintf("got %d files from API. "+ - "%d have been seen before. 
%d are new and sent to the output channel." + + "%d have been seen before. %d are new and sent to the output channel."+ " %d have kustomizationResultAdapter errors.", r.totalDocCnt, r.seenDocCnt, r.newDocCnt, r.errorCnt) } @@ -217,7 +219,7 @@ func (r *RangeQueryResult) String() string { // processQuery follows all of the pages in a query, and updates/adds the // documents from the crawl to the datastore/index. func processQuery(ctx context.Context, gcl GhClient, query string, - output chan<- crawler.CrawledDocument, seen crawler.SeenMap, + output chan<- crawler.CrawledDocument, seen utils.SeenMap, branchMap map[string]string) (RangeQueryResult, error) { queryPages := make(chan GhResponseInfo) @@ -271,7 +273,7 @@ func processQuery(ctx context.Context, gcl GhClient, query string, return result, errs } -func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen crawler.SeenMap, +func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap, branchMap map[string]string) (crawler.CrawledDocument, error) { url := gcl.ReposRequest(k.Repository.FullName) defaultBranch, err := gcl.GetDefaultBranch(url, k.Repository.URL, branchMap) diff --git a/api/internal/crawl/crawler/github/queries.go b/api/internal/crawl/crawler/github/queries.go index 557e0f371..ba05af38f 100644 --- a/api/internal/crawl/crawler/github/queries.go +++ b/api/internal/crawl/crawler/github/queries.go @@ -117,7 +117,7 @@ type RequestConfig struct { // understand why the request object is useful. 
func (rc RequestConfig) CodeSearchRequestWith(query Query) request { vals := url.Values{ - "sort": []string{"indexed"}, + "sort": []string{"indexed"}, "order": []string{"desc"}, } req := rc.makeRequest("search/code", query, vals) diff --git a/api/internal/crawl/doc/docname_test.go b/api/internal/crawl/doc/docname_test.go index f1b65dc8f..a03beaf06 100644 --- a/api/internal/crawl/doc/docname_test.go +++ b/api/internal/crawl/doc/docname_test.go @@ -65,7 +65,7 @@ func TestFromRelativePath(t *testing.T) { func TestDocument_RepositoryFullName(t *testing.T) { testCases := []struct { - doc Document + doc Document expectedRepositoryFullName string }{ { @@ -108,4 +108,4 @@ func TestDocument_RepositoryFullName(t *testing.T) { returnedRepositoryFullName) } } -} \ No newline at end of file +} diff --git a/api/internal/crawl/doc/unique_doc.go b/api/internal/crawl/doc/unique_doc.go new file mode 100644 index 000000000..026b345a5 --- /dev/null +++ b/api/internal/crawl/doc/unique_doc.go @@ -0,0 +1,36 @@ +package doc + +import ( + "sigs.k8s.io/kustomize/api/internal/crawl/utils" +) + +// UniqueDocuments makes sure a Document with a given ID appears only once +type UniqueDocuments struct { + docs []*Document + docIDs utils.SeenMap +} + +func NewUniqueDocuments() UniqueDocuments { + return UniqueDocuments{ + docs: []*Document{}, + docIDs: utils.NewSeenMap(), + } +} + +func (uds *UniqueDocuments) Add(d *Document) { + if uds.docIDs.Seen(d.ID()) { + return + } + uds.docs = append(uds.docs, d) + uds.docIDs.Add(d.ID()) +} + +func (uds *UniqueDocuments) AddDocuments(docs []*Document) { + for _, d := range docs { + uds.Add(d) + } +} + +func (uds *UniqueDocuments) Documents() []*Document { + return uds.docs +} diff --git a/api/internal/crawl/index/kustomize.go b/api/internal/crawl/index/kustomize.go index e55c5547e..7e2e7b104 100644 --- a/api/internal/crawl/index/kustomize.go +++ b/api/internal/crawl/index/kustomize.go @@ -18,6 +18,7 @@ const ( ) type Mode int + const ( InsertOrUpdate = iota 
Delete diff --git a/api/internal/crawl/utils/utils.go b/api/internal/crawl/utils/utils.go new file mode 100644 index 000000000..a397b2d52 --- /dev/null +++ b/api/internal/crawl/utils/utils.go @@ -0,0 +1,16 @@ +package utils + +type SeenMap map[string]struct{} + +func (seen SeenMap) Seen(item string) bool { + _, ok := seen[item] + return ok +} + +func (seen SeenMap) Add(item string) { + seen[item] = struct{}{} +} + +func NewSeenMap() SeenMap { + return make(map[string]struct{}) +}