diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index 2d301efe1..e534780d2 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -126,7 +126,7 @@ func main() { } } - // seen tracks the IDs of all the documents in the index. + // seen tracks the IDs of all the documents in the index and their corresponding file types. // This helps avoid indexing a given document multiple times. seen := utils.NewSeenMap() @@ -187,6 +187,12 @@ func main() { crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) case CrawlGithub: crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")} + // add all the documents in the index into seen. + // this greatly reduces the time overhead of CrawlGithub. + getSeedDocsFunc() + for _, d := range seedDocs { + seen[d.ID()] = d.FileType + } crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) case CrawlUser: if *githubUserPtr == "" { diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 11715616d..7488fcf3d 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -38,6 +38,8 @@ type Crawler interface { // Write to the document what the created time is. 
SetCreated(context.Context, *doc.Document) error + SetDefaultBranch(*doc.Document) + Match(*doc.Document) bool } @@ -78,7 +80,9 @@ func findMatch(d *doc.Document, crawlers []Crawler) Crawler { func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, seen utils.SeenMap, stack *CrawlSeed) { - seen.Add(cdoc.ID()) + seen.Set(cdoc.ID(), cdoc.GetDocument().FileType) + + match.SetDefaultBranch(cdoc.GetDocument()) // Insert into index if err := indx(cdoc, index.InsertOrUpdate); err != nil { @@ -87,14 +91,14 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, return } - deps, err := cdoc.GetResources(true, false, false) + deps, err := cdoc.GetResources(true, true, true) if err != nil { logger.Println(err) return } for _, dep := range deps { - if seen.Seen(dep.ID()) { + if seen.Seen(dep.ID()) && seen.Value(dep.ID()) == dep.FileType { continue } *stack = append(*stack, dep) @@ -102,7 +106,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, } func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc, - seen utils.SeenMap, stack *CrawlSeed) { + seen utils.SeenMap, stack *CrawlSeed, refreshDoc bool, updateFileType bool) { UpdatedDocCount := 0 seenDocCount := 0 @@ -126,9 +130,11 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C logger.Printf("Crawling doc %d: %s", crawledDocCount, tail.Path()) if seen.Seen(tail.ID()) { - logger.Printf("this doc has been seen before") - seenDocCount++ - continue + if !updateFileType || seen.Value(tail.ID()) == tail.FileType { + logger.Printf("this doc has been seen before") + seenDocCount++ + continue + } } if tail.WasCached() { @@ -144,6 +150,10 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C continue } + if tail.User == "" { + tail.User = doc.UserName(tail.RepositoryURL) + } + // If the Document represents a kustomization root, FetchDcoument will change // the `filePath` 
field of the Document by adding `kustomization.yaml` or // `kustomization.yml` or `kustomization` into the the field. @@ -151,26 +161,34 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C // calling FetchDocument. Otherwise, the binary may enter into an infinite loop // if a kustomization file points to its kustmozation root in its `resources` or // `bases` field. - seen.Add(tail.ID()) + seen.Set(tail.ID(), tail.FileType) - if err := match.FetchDocument(ctx, tail); err != nil { - logger.Printf("FetchDocument failed on doc(%s): %v", tail.Path(), err) - FetchDocumentErrCount++ - // delete the document from the index - cdoc := &doc.KustomizationDocument{ - Document: *tail, - } - seen.Add(cdoc.ID()) - if err := indx(cdoc, index.Delete); err != nil { - logger.Printf("Failed to delete doc(%s): %v", cdoc.Path(), err) - } - deleteDocCount++ - continue + if refreshDoc || tail.DefaultBranch == "" { + match.SetDefaultBranch(tail) } - if err := match.SetCreated(ctx, tail); err != nil { - logger.Printf("SetCreated failed on doc(%s): %v", tail.Path(), err) - SetCreatedErrCount++ + if refreshDoc || tail.DocumentData == "" { + if err := match.FetchDocument(ctx, tail); err != nil { + logger.Printf("FetchDocument failed on doc(%s): %v", tail.Path(), err) + FetchDocumentErrCount++ + // delete the document from the index + cdoc := &doc.KustomizationDocument{ + Document: *tail, + } + seen.Set(cdoc.ID(), tail.FileType) + if err := indx(cdoc, index.Delete); err != nil { + logger.Printf("Failed to delete doc(%s): %v", cdoc.Path(), err) + } + deleteDocCount++ + continue + } + } + + if refreshDoc || tail.CreationTime == nil { + if err := match.SetCreated(ctx, tail); err != nil { + logger.Printf("SetCreated failed on doc(%s): %v", tail.Path(), err) + SetCreatedErrCount++ + } } cdoc, err := conv(tail) @@ -206,14 +224,14 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler, // Exploit seed to update bulk of corpus. 
logger.Printf("updating %d documents from seed\n", len(seed)) // each unique document in seed will be crawled once. - doCrawl(ctx, &seed, crawlers, conv, indx, seen, &stack) + doCrawl(ctx, &seed, crawlers, conv, indx, seen, &stack, true, false) // Traverse any new documents added while updating corpus. logger.Printf("crawling %d new documents found in the seed\n", len(stack)) // While crawling each document in stack, the documents directly referred in the document // will be added into stack. // After this statement is done, stack will become empty. - doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack) + doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack, false, true) } // CrawlGithubRunner is a blocking function and only returns once all of the @@ -294,6 +312,8 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, for cdoc := range ch { docCount++ logger.Printf("Processing doc %d found on Github", docCount) + // all the docs here are kustomization files found by querying Github, and + // their `FileType` fields all should be empty. if seen.Seen(cdoc.ID()) { logger.Printf("the doc has been seen before") continue @@ -320,5 +340,5 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, // Handle deps of newly discovered documents. 
logger.Printf("crawling the %d new documents referred by other documents", len(stack)) - doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack) + doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack, false, true) } diff --git a/api/internal/crawl/crawler/crawler_test.go b/api/internal/crawl/crawler/crawler_test.go index d18a4afce..5e93d5bcb 100644 --- a/api/internal/crawl/crawler/crawler_test.go +++ b/api/internal/crawl/crawler/crawler_test.go @@ -37,6 +37,8 @@ func (c testCrawler) Match(d *doc.Document) bool { return d != nil } +func (c testCrawler) SetDefaultBranch(d *doc.Document) {} + func (c testCrawler) FetchDocument(_ context.Context, d *doc.Document) error { if i, ok := c.lukp[d.ID()]; ok { d.DocumentData = c.docs[i].DocumentData diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index e1aeff401..21cfd6f81 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -60,8 +60,16 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client, } } -func (gc githubCrawler) SetDefaultBranch(repo, branch string) { - gc.branchMap[repo] = branch +func (gc githubCrawler) SetDefaultBranch(d *doc.Document) { + url := gc.client.ReposRequest(d.RepositoryFullName()) + defaultBranch, err := gc.client.GetDefaultBranch(url, d.RepositoryURL, gc.branchMap) + if err != nil { + logger.Printf( + "(error: %v) setting default_branch to master\n", err) + defaultBranch = "master" + } + d.DefaultBranch = defaultBranch + gc.branchMap[d.RepositoryURL] = d.DefaultBranch } func (gc githubCrawler) DefaultBranch(repo string) string { @@ -79,10 +87,20 @@ func (gc githubCrawler) Crawl(ctx context.Context, accessToken: gc.client.accessToken, } + var ranges []string + var err error // Since Github returns a max of 1000 results per query, we can use // multiple queries that split the search space into chunks of at most // 1000 files to get all of the data. 
- ranges, err := FindRangesForRepoSearch(newCache(noETagClient, gc.query)) + for i := 0; i < 5; i++ { + ranges, err = FindRangesForRepoSearch(newCache(noETagClient, gc.query)) + if err == nil { + logger.Printf("FindRangesForRepoSearch succeeded after %d retries", i) + break + } else { + time.Sleep(time.Minute) + } + } if err != nil { return fmt.Errorf("could not split %v into ranges, %v\n", gc.query, err) @@ -114,19 +132,6 @@ func (gc githubCrawler) Crawl(ctx context.Context, // it will try to add each string in konfig.RecognizedKustomizationFileNames() to // d.FilePath, and try to fetch the document again. func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error { - // set the default branch if it is empty - if d.DefaultBranch == "" { - url := gc.client.ReposRequest(d.RepositoryFullName()) - defaultBranch, err := gc.client.GetDefaultBranch(url, d.RepositoryURL, gc.branchMap) - if err != nil { - logger.Printf( - "(error: %v) setting default_branch to master\n", err) - defaultBranch = "master" - } - d.DefaultBranch = defaultBranch - } - gc.SetDefaultBranch(d.RepositoryURL, d.DefaultBranch) - repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch repoSpec, err := git.NewRepoSpecFromUrl(repoURL) if err != nil { @@ -283,10 +288,13 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap, defaultBranch = "master" } + // document here is a kustomization file found by querying Github, whose + // `FileType` field should be empty. 
document := doc.Document{ FilePath: k.Path, DefaultBranch: defaultBranch, RepositoryURL: k.Repository.URL, + User: doc.UserName(k.Repository.URL), } if seen.Seen(document.ID()) { @@ -304,6 +312,7 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap, FilePath: k.Path, DefaultBranch: defaultBranch, RepositoryURL: k.Repository.URL, + User: doc.UserName(k.Repository.URL), }, } creationTime, err := gcl.GetFileCreationTime(k) diff --git a/api/internal/crawl/doc/doc.go b/api/internal/crawl/doc/doc.go index 87cd5a3e2..0e5965ac1 100644 --- a/api/internal/crawl/doc/doc.go +++ b/api/internal/crawl/doc/doc.go @@ -87,17 +87,17 @@ func (doc *KustomizationDocument) GetResources( res := make([]*Document, 0) if includeResources { - resourceDocs := doc.CollectDocuments(k.Resources) + resourceDocs := doc.CollectDocuments(k.Resources, "resource") res = append(res, resourceDocs...) } if includeGenerators { - generatorDocs := doc.CollectDocuments(k.Generators) + generatorDocs := doc.CollectDocuments(k.Generators, "generator") res = append(res, generatorDocs...) } if includeTransformers { - transformerDocs := doc.CollectDocuments(k.Transformers) + transformerDocs := doc.CollectDocuments(k.Transformers, "transformer") res = append(res, transformerDocs...) } @@ -106,7 +106,8 @@ func (doc *KustomizationDocument) GetResources( // CollectDocuments construct a Document for each path in paths, and return // a slice of Document pointers. 
-func (doc *KustomizationDocument) CollectDocuments(paths []string) []*Document { +func (doc *KustomizationDocument) CollectDocuments( + paths []string, fileType string) []*Document { docs := make([]*Document, 0, len(paths)) for _, r := range paths { if strings.TrimSpace(r) == "" { @@ -117,6 +118,7 @@ func (doc *KustomizationDocument) CollectDocuments(paths []string) []*Document { log.Printf("CollectDocuments error: %v\n", err) continue } + next.FileType = fileType docs = append(docs, &next) } return docs diff --git a/api/internal/crawl/doc/doc_test.go b/api/internal/crawl/doc/doc_test.go index c193809a6..0bc7bf3b1 100644 --- a/api/internal/crawl/doc/doc_test.go +++ b/api/internal/crawl/doc/doc_test.go @@ -215,19 +215,27 @@ resources: { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/base", + FileType: "resource", + User: "sigs.k8s.io", }, { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/otherbase", + FileType: "resource", + User: "sigs.k8s.io", }, { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/kdir/file.yaml", + FileType: "resource", + User: "sigs.k8s.io", }, { RepositoryURL: "https://github.com/kubernetes-sigs/kustomize", FilePath: "examples/helloWorld", DefaultBranch: "v3.1.0", + FileType: "resource", + User: "kubernetes-sigs", }, }, }, @@ -312,10 +320,14 @@ transformers: { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/kdir/gen.yaml", + FileType: "generator", + User: "sigs.k8s.io", }, { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/kdir/file.yaml", + FileType: "resource", + User: "sigs.k8s.io", }, }, }, @@ -345,14 +357,20 @@ transformers: { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/kdir/tr.yaml", + FileType: "transformer", + User: "sigs.k8s.io", }, { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/kdir/gen.yaml", + FileType: "generator", + User: "sigs.k8s.io", }, { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: 
"some/path/to/kdir/file.yaml", + FileType: "resource", + User: "sigs.k8s.io", }, }, }, diff --git a/api/internal/crawl/doc/docname.go b/api/internal/crawl/doc/docname.go index e295e4620..fe1698943 100644 --- a/api/internal/crawl/doc/docname.go +++ b/api/internal/crawl/doc/docname.go @@ -11,12 +11,18 @@ import ( ) type Document struct { - RepositoryURL string `json:"repositoryUrl,omitempty"` + RepositoryURL string `json:"repositoryUrl,omitempty"` + // User makes it easy to aggregate data in the user level instead + // of the repository level + User string `json:"user,omitempty"` FilePath string `json:"filePath,omitempty"` DefaultBranch string `json:"defaultBranch,omitempty"` DocumentData string `json:"document,omitempty"` CreationTime *time.Time `json:"creationTime,omitempty"` IsSame bool `json:"-"` + // FileType can be one of the following: + // "generator", "transformer", "resource", "". + FileType string `json:"fileType,omitempty"` } // Implements the CrawlerDocument interface. @@ -27,11 +33,13 @@ func (doc *Document) GetDocument() *Document { func (doc *Document) Copy() *Document { return &Document{ RepositoryURL: doc.RepositoryURL, + User: doc.User, FilePath: doc.FilePath, DefaultBranch: doc.DefaultBranch, DocumentData: doc.DocumentData, CreationTime: doc.CreationTime, IsSame: doc.IsSame, + FileType: doc.FileType, } } @@ -52,6 +60,7 @@ func (doc *Document) FromRelativePath(newFile string) (Document, error) { RepositoryURL: repoSpec.Host + path.Clean(repoSpec.OrgRepo), FilePath: path.Clean(repoSpec.Path), DefaultBranch: repoSpec.Ref, + User: UserName(repoSpec.Host + path.Clean(repoSpec.OrgRepo)), }, nil } // else document is probably relative path. 
@@ -59,6 +68,7 @@ func (doc *Document) FromRelativePath(newFile string) (Document, error) { ret := Document{ RepositoryURL: doc.RepositoryURL, DefaultBranch: doc.DefaultBranch, + User: UserName(doc.RepositoryURL), } ogDir, _ := path.Split(doc.FilePath) @@ -83,13 +93,7 @@ func (doc *Document) ID() string { } func (doc *Document) RepositoryFullName() string { - url := strings.TrimRight(doc.RepositoryURL, "/") - - gitPrefix := "git@github.com:" - if strings.HasPrefix(url, gitPrefix) { - url = url[len(gitPrefix):] - } - + url := TrimUrl(doc.RepositoryURL) sections := strings.Split(url, "/") l := len(sections) if l < 2 { @@ -97,3 +101,24 @@ func (doc *Document) RepositoryFullName() string { } return path.Join(sections[l-2], sections[l-1]) } + +// TrimUrl removes all the trailing slashes and the "git@github.com:" prefix (if exists). +func TrimUrl(s string) string { + url := strings.TrimRight(s, "/") + + gitPrefix := "git@github.com:" + if strings.HasPrefix(url, gitPrefix) { + url = url[len(gitPrefix):] + } + return url +} + +func UserName(repositoryURL string) string { + url := TrimUrl(repositoryURL) + sections := strings.Split(url, "/") + l := len(sections) + if l < 2 { + return url + } + return sections[l-2] +} diff --git a/api/internal/crawl/doc/docname_test.go b/api/internal/crawl/doc/docname_test.go index a03beaf06..00ae08cff 100644 --- a/api/internal/crawl/doc/docname_test.go +++ b/api/internal/crawl/doc/docname_test.go @@ -28,6 +28,7 @@ func TestFromRelativePath(t *testing.T) { RepositoryURL: "example.com/repo", FilePath: "path/to/other/file/resource.yaml", DefaultBranch: "master", + User: "example.com", }, }, { @@ -36,6 +37,7 @@ func TestFromRelativePath(t *testing.T) { RepositoryURL: "example.com/repo", FilePath: "path/to/other/file/patch.yaml", DefaultBranch: "master", + User: "example.com", }, }, { @@ -44,6 +46,7 @@ func TestFromRelativePath(t *testing.T) { RepositoryURL: "example.com/repo", FilePath: "path/to/file/service.yaml", DefaultBranch: "master", + 
User: "example.com", }, }, }, @@ -109,3 +112,39 @@ func TestDocument_RepositoryFullName(t *testing.T) { } } } + +func TestDocument_UserName(t *testing.T) { + testCases := []struct { + repositoryURL string + expectedUserName string + }{ + { + repositoryURL: "https://github.com/user/repo", + expectedUserName: "user", + }, + { + repositoryURL: "https://github.com//user/repo////", + expectedUserName: "user", + }, + { + repositoryURL: "repo/", + expectedUserName: "repo", + }, + { + repositoryURL: "", + expectedUserName: "", + }, + { + repositoryURL: "git@github.com:user/repo", + expectedUserName: "user", + }, + } + + for _, tc := range testCases { + returnedUserName := UserName(tc.repositoryURL) + if returnedUserName != tc.expectedUserName { + t.Errorf("UserName expected %s, got %s", + tc.expectedUserName, returnedUserName) + } + } +} diff --git a/api/internal/crawl/doc/unique_doc.go b/api/internal/crawl/doc/unique_doc.go index 026b345a5..da5294b1f 100644 --- a/api/internal/crawl/doc/unique_doc.go +++ b/api/internal/crawl/doc/unique_doc.go @@ -22,7 +22,7 @@ func (uds *UniqueDocuments) Add(d *Document) { return } uds.docs = append(uds.docs, d) - uds.docIDs.Add(d.ID()) + uds.docIDs.Set(d.ID(), "") } func (uds *UniqueDocuments) AddDocuments(docs []*Document) { diff --git a/api/internal/crawl/index/elasticsearch.go b/api/internal/crawl/index/elasticsearch.go index 7d7ce2b9e..a1f465a36 100644 --- a/api/internal/crawl/index/elasticsearch.go +++ b/api/internal/crawl/index/elasticsearch.go @@ -20,12 +20,18 @@ const IndexConfig = ` "repositoryUrl": { "type": "keyword" }, + "user": { + "type": "keyword" + }, "filePath": { "type": "keyword" }, "defaultBranch": { "type": "keyword" }, + "fileType": { + "type": "keyword" + }, "document": { "type": "text" }, diff --git a/api/internal/crawl/search_cmds/creationTime.md b/api/internal/crawl/search_cmds/creationTime.md index 3ebfaf157..08b6b3e1e 100644 --- a/api/internal/crawl/search_cmds/creationTime.md +++ 
b/api/internal/crawl/search_cmds/creationTime.md @@ -27,7 +27,7 @@ curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'C "query": { "bool": { "filter": [ - { "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }} + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} ] } }, @@ -45,8 +45,53 @@ curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'C "query": { "bool": { "must_not": { - "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" } - } + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "resource" }} + ] + } + }, + "aggs" : { + "min_creationTime" : { "min" : { "field" : "creationTime" } } + } +} +' +``` + +Find out the smallest value of the `creationTime` field of all kustomize generator files: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "generator" }} + ] + } + }, + "aggs" : { + "min_creationTime" : { "min" : { "field" : "creationTime" } } + } +} +' +``` + +Find out the smallest value of the `creationTime` field of all kustomize transformer files: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "transformer" }} + ] } }, "aggs" : { @@ -87,6 +132,30 @@ curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-T ' ``` +Query all the kustomization files whose `creationTime` falls within the specific range: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' 
+{ + "size": 20, + "query": { + "bool": { + "filter": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "must": { + "range": { + "creationTime": { + "gte": "2017-09-24T15:49:57.000Z", + "lte": "2017-09-24T15:49:57.000Z" + } + } + } + } + } +} +' +``` + Aggregate how many new kustomization files were added into Github each month: ``` curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' @@ -94,7 +163,7 @@ curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Co "query": { "bool": { "filter": [ - { "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }} + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} ] } }, @@ -117,7 +186,62 @@ curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Co "query": { "bool": { "must_not": [ - { "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }} + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ], + "filter": [ + { "regexp": { "fileType": "resource" }} + ] + } + }, + "aggs" : { + "newFiles_over_time" : { + "date_histogram" : { + "field" : "creationTime", + "interval" : "month" + } + } + } +} +' +``` + +Aggregate how many new kustomize generator files were added into Github each month: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": [ + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ], + "filter": [ + { "regexp": { "fileType": "generator" }} + ] + } + }, + "aggs" : { + "newFiles_over_time" : { + "date_histogram" : { + "field" : "creationTime", + "interval" : "month" + } + } + } +} +' +``` + +Aggregate how many new kustomize transformer files were added into Github each month: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: 
application/json' -d' +{ + "query": { + "bool": { + "must_not": [ + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ], + "filter": [ + { "regexp": { "fileType": "transformer" }} ] } }, @@ -140,7 +264,7 @@ curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Co "query": { "bool": { "filter": [ - { "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }} + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} ] } }, @@ -163,8 +287,11 @@ curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Co "query": { "bool": { "must_not": [ - { "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }} - ] + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ], + "filter": [ + { "regexp": { "fileType": "resource" }} + ] } }, "aggs" : { @@ -177,4 +304,108 @@ curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Co } } ' +``` + +Aggregate how many new kustomize generator files were added into Github each year: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": [ + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ], + "filter": [ + { "regexp": { "fileType": "generator" }} + ] + } + }, + "aggs" : { + "newFiles_over_time" : { + "date_histogram" : { + "field" : "creationTime", + "interval" : "year" + } + } + } +} +' +``` + +Aggregate how many new kustomize transformer files were added into Github each year: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": [ + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ], + "filter": [ + { "regexp": { "fileType": "transformer" }} + ] + } + }, + "aggs" : { + "newFiles_over_time" : { + "date_histogram" : { + "field" : 
"creationTime", + "interval" : "year" + } + } + } +} +' +``` + +Find the generator files created within the given time range: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "generator" }} + ], + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "must": { + "range": { + "creationTime": { + "gte": "2019-04-26T16:40:02.000Z", + "lte": "2019-04-26T16:40:02.000Z" + } + } + } + } + } +} +' +``` + +Find the transformer files created within the given time range: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "transformer" }} + ], + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "must": { + "range": { + "creationTime": { + "gte": "2019-04-26T16:40:02.000Z", + "lte": "2019-04-26T16:40:02.000Z" + } + } + } + } + } +} +' ``` \ No newline at end of file diff --git a/api/internal/crawl/search_cmds/fileType.md b/api/internal/crawl/search_cmds/fileType.md new file mode 100644 index 000000000..3be7b41e1 --- /dev/null +++ b/api/internal/crawl/search_cmds/fileType.md @@ -0,0 +1,301 @@ +Find all the documents having the `fileType` field set: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "exists": { + "field": "fileType" + } + } +} +' +``` + +Find all the documents whose `fileType` field is not set: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "size": 10000, + "query": { + "bool": { + "must_not": { + "exists": { + "field": "fileType" + } + } + } + } +} +' +``` + +Search for all the documents whose `fileType` field is `resource`: +``` +curl -X GET 
"${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "resource" }} + ] + } + } +} +' +``` + +Search for all the kustomization files whose `fileType` field is `resource`: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }}, + { "regexp": { "fileType": "resource" }} + ] + } + } +} +' +``` + +Search for all the kustomize resource files: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "resource" }} + ], + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + } + } + } +} +' +``` + +Search all the kustomization files including a `generators` field: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "size": 10000, + "query": { + "bool": { + "must": { + "match" : { + "identifiers" : { + "query" : "generators" + } + } + }, + "filter": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + } + } + } +} +' +``` + +Search for all the documents whose `fileType` field is `generator`: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "generator" }} + ] + } + } +} +' +``` + +Search for all the kustomization files whose `fileType` field is `generator`: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }}, + { "regexp": { "fileType": 
"generator" }} + ] + } + } +} +' +``` + +Search for all the kustomize generator files: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "generator" }} + ], + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + } + } + } +} +' +``` + +Search all the kustomization files including a `transformers` field: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "size": 10000, + "query": { + "bool": { + "must": { + "match" : { + "identifiers" : { + "query" : "transformers" + } + } + }, + "filter": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + } + } + } +} +' +``` + +Search for all the documents whose `fileType` field is `transformer`: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "transformer" }} + ] + } + } +} +' +``` + +Search for all the kustomization files whose `fileType` field is `transformer`: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }}, + { "regexp": { "fileType": "transformer" }} + ] + } + } +} +' +``` + +Search for all the kustomize transformer files: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "transformer" }} + ], + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + } + } + } +} +' +``` + +Count distinct values of the `fileType` field: +``` +curl -X POST 
"${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "aggs" : { + "fileType_count" : { + "cardinality" : { + "field" : "fileType", + "precision_threshold": 40000 + } + } + } +} +' +``` + +List all the values of the `fileType` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "aggs" : { + "fileType" : { + "terms" : { + "field" : "fileType" + } + } + } +} +' +``` + + +For all the kustomization files in the index, list all the values of the +`fileType` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ] + } + }, + "aggs" : { + "fileType" : { + "terms" : { + "field" : "fileType" + } + } + } +} +' +``` + +For all the non-kustomization files in the index, list all the values of the +`fileType` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + } + } + }, + "aggs" : { + "fileType" : { + "terms" : { + "field" : "fileType" + } + } + } +} +' +``` diff --git a/api/internal/crawl/search_cmds/id.md b/api/internal/crawl/search_cmds/id.md new file mode 100644 index 000000000..66d767eae --- /dev/null +++ b/api/internal/crawl/search_cmds/id.md @@ -0,0 +1,12 @@ +Find the document with the given `_id`: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "terms": { + "_id": [ "b3a03f3327841617db696e2d6abc30e1a1bd653f1a2bbce05637f7dcae1a43f7" ] + } + } +} +' +``` diff --git 
a/api/internal/crawl/search_cmds/keyword_search.md b/api/internal/crawl/search_cmds/keyword_search.md index 588f938fb..99a677813 100644 --- a/api/internal/crawl/search_cmds/keyword_search.md +++ b/api/internal/crawl/search_cmds/keyword_search.md @@ -57,7 +57,7 @@ curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-T "query": { "bool": { "filter": [ - { "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }} + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} ] } } @@ -73,7 +73,7 @@ curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-T "query": { "bool": { "must_not": [ - { "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }} + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} ] } } diff --git a/api/internal/crawl/search_cmds/misc.md b/api/internal/crawl/search_cmds/misc.md index 303ae5d3b..d68736ea2 100644 --- a/api/internal/crawl/search_cmds/misc.md +++ b/api/internal/crawl/search_cmds/misc.md @@ -16,4 +16,17 @@ curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_mapping?pretty" Delete the kustomize index from the ElasticSearch cluster (**Use this command with caution**): ``` curl -X DELETE "${ElasticSearchURL}:9200/${INDEXNAME}?pretty" +``` + +Add a new `keyword` field (here `fileType`) to the mapping of an existing index:
+``` +curl -X PUT "${ElasticSearchURL}:9200/${INDEXNAME}/_mapping/_doc?pretty" -H 'Content-Type: application/json' -d' +{ + "properties": { + "fileType": { + "type": "keyword" + } + } +} +' ``` \ No newline at end of file diff --git a/api/internal/crawl/search_cmds/repositoryUrl.md b/api/internal/crawl/search_cmds/repositoryUrl.md index 291aa1c69..37e99d088 100644 --- a/api/internal/crawl/search_cmds/repositoryUrl.md +++ b/api/internal/crawl/search_cmds/repositoryUrl.md @@ -21,7 +21,7 @@ curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'C "query": { "bool": { "filter": [ - { "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }} + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} ] } }, @@ -37,16 +37,143 @@ curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'C ' ``` -Count how many Github repositories include kustomize resource files: +Count distinct values of the `repositoryUrl` field for all the kustomize resource files in the index: ``` curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' { "query": { "bool": { "must_not": { - "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" } + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "resource" }} + ] + } + }, + "aggs" : { + "repositoryUrl_count" : { + "cardinality" : { + "field" : "repositoryUrl", + "precision_threshold": 40000 } } + } +} +' +``` + +Count distinct values of the `repositoryUrl` field for all the kustomize generator files in the index: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "generator" }} + ] + } + }, + "aggs" : { + 
"repositoryUrl_count" : { + "cardinality" : { + "field" : "repositoryUrl", + "precision_threshold": 40000 + } + } + } +} +' +``` + +Count distinct values of the `repositoryUrl` field for all the kustomize transformer files in the index: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "transformer" }} + ] + } + }, + "aggs" : { + "repositoryUrl_count" : { + "cardinality" : { + "field" : "repositoryUrl", + "precision_threshold": 40000 + } + } + } +} +' +``` + +Count distinct values of the `repositoryUrl` field for all the kustomize resource dirs in the index: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "resource" }}, + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ] + } + }, + "aggs" : { + "repositoryUrl_count" : { + "cardinality" : { + "field" : "repositoryUrl", + "precision_threshold": 40000 + } + } + } +} +' +``` + +Count distinct values of the `repositoryUrl` field for all the kustomize generator dirs in the index: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "generator" }}, + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ] + } + }, + "aggs" : { + "repositoryUrl_count" : { + "cardinality" : { + "field" : "repositoryUrl", + "precision_threshold": 40000 + } + } + } +} +' +``` + +Count distinct values of the `repositoryUrl` field for all the kustomize transformer dirs in the index: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: 
application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "transformer" }}, + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ] + } }, "aggs" : { "repositoryUrl_count" : { @@ -85,7 +212,7 @@ curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Co "query": { "bool": { "filter": [ - { "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }} + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} ] } }, @@ -108,8 +235,11 @@ curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Co "query": { "bool": { "must_not": { - "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" } - } + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "resource" }} + ] } }, "aggs" : { diff --git a/api/internal/crawl/search_cmds/user.md b/api/internal/crawl/search_cmds/user.md new file mode 100644 index 000000000..83f4cd2e5 --- /dev/null +++ b/api/internal/crawl/search_cmds/user.md @@ -0,0 +1,380 @@ +Find all the documents having the `user` field set: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "exists": { + "field": "user" + } + } +} +' +``` + +Find all the documents whose `user` field is not set: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "size": 10000, + "query": { + "bool": { + "must_not": { + "exists": { + "field": "user" + } + } + } + } +} +' +``` + +Search for all the documents whose `user` field is `kubernetes-sigs`: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "user": "kubernetes-sigs" }} + ] + } + } +} +' +``` + +Count distinct values of the `user` field: +``` +curl -X POST 
"${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "aggs" : { + "user_count" : { + "cardinality" : { + "field" : "user", + "precision_threshold": 40000 + } + } + } +} +' +``` + +List all the values of the `user` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "aggs" : { + "user" : { + "terms" : { + "field" : "user", + "size" : 20 + } + } + } +} +' +``` + +Count distinct values of the `user` field for all the kustomization files in the index: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ] + } + }, + "aggs" : { + "user_count" : { + "cardinality" : { + "field" : "user", + "precision_threshold": 40000 + } + } + } +} +' +``` + +For all the kustomization files in the index, list all the values of the +`user` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ] + } + }, + "aggs" : { + "user" : { + "terms" : { + "field" : "user", + "size": 20 + } + } + } +} +' +``` + +Count distinct values of the `user` field for all the kustomize resource files in the index: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "resource" }} + ] + } + }, + "aggs" : { + "user_count" : { + "cardinality" : { + "field" : "user", + "precision_threshold": 40000 + } + } + } +} +' +``` + 
+For all the kustomize resource files in the index, list all the values of the +`user` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "resource" }} + ] + } + }, + "aggs" : { + "user" : { + "terms" : { + "field" : "user", + "size": 20 + } + } + } +} +' +``` + +Count distinct values of the `user` field for all the kustomize generator files in the index: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "generator" }} + ] + } + }, + "aggs" : { + "user_count" : { + "cardinality" : { + "field" : "user", + "precision_threshold": 40000 + } + } + } +} +' +``` + +For all the kustomize generator files in the index, list all the values of the +`user` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "generator" }} + ] + } + }, + "aggs" : { + "user" : { + "terms" : { + "field" : "user", + "size": 20 + } + } + } +} +' +``` + +Count distinct values of the `user` field for all the kustomize transformer files in the index: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "transformer" 
}} + ] + } + }, + "aggs" : { + "user_count" : { + "cardinality" : { + "field" : "user", + "precision_threshold": 40000 + } + } + } +} +' +``` + +For all the kustomize transformer files in the index, list all the values of the +`user` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "filter": [ + { "regexp": { "fileType": "transformer" }} + ] + } + }, + "aggs" : { + "user" : { + "terms" : { + "field" : "user", + "size": 20 + } + } + } +} +' +``` + +Count distinct values of the `user` field for all the kustomize generator dirs in the index: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "generator" }}, + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ] + } + }, + "aggs" : { + "user_count" : { + "cardinality" : { + "field" : "user", + "precision_threshold": 40000 + } + } + } +} +' +``` + +For all the kustomize generator dirs in the index, list all the values of the +`user` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "generator" }}, + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ] + } + }, + "aggs" : { + "user" : { + "terms" : { + "field" : "user", + "size": 20 + } + } + } +} +' +``` + +Count distinct values of the `user` field for all the kustomize transformer dirs in the index: +``` +curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { 
"fileType": "transformer" }}, + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ] + } + }, + "aggs" : { + "user_count" : { + "cardinality" : { + "field" : "user", + "precision_threshold": 40000 + } + } + } +} +' +``` + +For all the kustomize transformer dirs in the index, list all the values of the +`user` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "transformer" }}, + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ] + } + }, + "aggs" : { + "user" : { + "terms" : { + "field" : "user" + } + } + } +} +' +``` \ No newline at end of file diff --git a/api/internal/crawl/utils/utils.go b/api/internal/crawl/utils/utils.go index a397b2d52..d6b6fab68 100644 --- a/api/internal/crawl/utils/utils.go +++ b/api/internal/crawl/utils/utils.go @@ -1,16 +1,21 @@ package utils -type SeenMap map[string]struct{} +type SeenMap map[string]string func (seen SeenMap) Seen(item string) bool { _, ok := seen[item] return ok } -func (seen SeenMap) Add(item string) { - seen[item] = struct{}{} +func (seen SeenMap) Set(k, v string) { + seen[k] = v +} + +// Value returns the value stored for k. The caller should make sure that k is in the map (see Seen); a missing key yields the empty string. +func (seen SeenMap) Value(k string) string { + return seen[k] } func NewSeenMap() SeenMap { - return make(map[string]struct{}) + return make(map[string]string) }