diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 7861749f0..9a6a401d6 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -148,6 +148,10 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C continue } + if tail.User == "" { + tail.User = doc.UserName(tail.RepositoryURL) + } + // If the Document represents a kustomization root, FetchDcoument will change // the `filePath` field of the Document by adding `kustomization.yaml` or // `kustomization.yml` or `kustomization` into the the field. diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 70c1a23ca..f1b723c5b 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -284,6 +284,7 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap, FilePath: k.Path, DefaultBranch: defaultBranch, RepositoryURL: k.Repository.URL, + User: doc.UserName(k.Repository.URL), } if seen.Seen(document.ID()) { @@ -301,6 +302,7 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap, FilePath: k.Path, DefaultBranch: defaultBranch, RepositoryURL: k.Repository.URL, + User: doc.UserName(k.Repository.URL), }, } creationTime, err := gcl.GetFileCreationTime(k) diff --git a/api/internal/crawl/doc/doc_test.go b/api/internal/crawl/doc/doc_test.go index e66241287..0bc7bf3b1 100644 --- a/api/internal/crawl/doc/doc_test.go +++ b/api/internal/crawl/doc/doc_test.go @@ -216,22 +216,26 @@ resources: RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/base", FileType: "resource", + User: "sigs.k8s.io", }, { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/otherbase", FileType: "resource", + User: "sigs.k8s.io", }, { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/kdir/file.yaml", FileType: "resource", + User: "sigs.k8s.io", }, { 
RepositoryURL: "https://github.com/kubernetes-sigs/kustomize", FilePath: "examples/helloWorld", DefaultBranch: "v3.1.0", FileType: "resource", + User: "kubernetes-sigs", }, }, }, @@ -317,11 +321,13 @@ transformers: RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/kdir/gen.yaml", FileType: "generator", + User: "sigs.k8s.io", }, { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/kdir/file.yaml", FileType: "resource", + User: "sigs.k8s.io", }, }, }, @@ -352,16 +358,19 @@ transformers: RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/kdir/tr.yaml", FileType: "transformer", + User: "sigs.k8s.io", }, { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/kdir/gen.yaml", FileType: "generator", + User: "sigs.k8s.io", }, { RepositoryURL: "sigs.k8s.io/kustomize", FilePath: "some/path/to/kdir/file.yaml", FileType: "resource", + User: "sigs.k8s.io", }, }, }, diff --git a/api/internal/crawl/doc/docname.go b/api/internal/crawl/doc/docname.go index a3cab942a..fe1698943 100644 --- a/api/internal/crawl/doc/docname.go +++ b/api/internal/crawl/doc/docname.go @@ -11,7 +11,10 @@ import ( ) type Document struct { - RepositoryURL string `json:"repositoryUrl,omitempty"` + RepositoryURL string `json:"repositoryUrl,omitempty"` + // User makes it easy to aggregate data at the user level instead + // of at the repository level. + User string `json:"user,omitempty"` FilePath string `json:"filePath,omitempty"` DefaultBranch string `json:"defaultBranch,omitempty"` DocumentData string `json:"document,omitempty"` @@ -30,6 +33,7 @@ func (doc *Document) GetDocument() *Document { func (doc *Document) Copy() *Document { return &Document{ RepositoryURL: doc.RepositoryURL, + User: doc.User, FilePath: doc.FilePath, DefaultBranch: doc.DefaultBranch, DocumentData: doc.DocumentData, @@ -56,6 +60,7 @@ func (doc *Document) FromRelativePath(newFile string) (Document, error) { RepositoryURL: repoSpec.Host + path.Clean(repoSpec.OrgRepo), FilePath: 
path.Clean(repoSpec.Path), DefaultBranch: repoSpec.Ref, + User: UserName(repoSpec.Host + path.Clean(repoSpec.OrgRepo)), }, nil } // else document is probably relative path. @@ -63,6 +68,7 @@ func (doc *Document) FromRelativePath(newFile string) (Document, error) { ret := Document{ RepositoryURL: doc.RepositoryURL, DefaultBranch: doc.DefaultBranch, + User: UserName(doc.RepositoryURL), } ogDir, _ := path.Split(doc.FilePath) @@ -87,13 +93,7 @@ func (doc *Document) ID() string { } func (doc *Document) RepositoryFullName() string { - url := strings.TrimRight(doc.RepositoryURL, "/") - - gitPrefix := "git@github.com:" - if strings.HasPrefix(url, gitPrefix) { - url = url[len(gitPrefix):] - } - + url := TrimUrl(doc.RepositoryURL) sections := strings.Split(url, "/") l := len(sections) if l < 2 { @@ -101,3 +101,24 @@ func (doc *Document) RepositoryFullName() string { } return path.Join(sections[l-2], sections[l-1]) } + +// TrimUrl removes all the trailing slashes and the "git@github.com:" prefix (if present). 
+func TrimUrl(s string) string { + url := strings.TrimRight(s, "/") + + gitPrefix := "git@github.com:" + if strings.HasPrefix(url, gitPrefix) { + url = url[len(gitPrefix):] + } + return url +} + +func UserName(repositoryURL string) string { + url := TrimUrl(repositoryURL) + sections := strings.Split(url, "/") + l := len(sections) + if l < 2 { + return url + } + return sections[l-2] +} diff --git a/api/internal/crawl/doc/docname_test.go b/api/internal/crawl/doc/docname_test.go index a03beaf06..00ae08cff 100644 --- a/api/internal/crawl/doc/docname_test.go +++ b/api/internal/crawl/doc/docname_test.go @@ -28,6 +28,7 @@ func TestFromRelativePath(t *testing.T) { RepositoryURL: "example.com/repo", FilePath: "path/to/other/file/resource.yaml", DefaultBranch: "master", + User: "example.com", }, }, { @@ -36,6 +37,7 @@ func TestFromRelativePath(t *testing.T) { RepositoryURL: "example.com/repo", FilePath: "path/to/other/file/patch.yaml", DefaultBranch: "master", + User: "example.com", }, }, { @@ -44,6 +46,7 @@ func TestFromRelativePath(t *testing.T) { RepositoryURL: "example.com/repo", FilePath: "path/to/file/service.yaml", DefaultBranch: "master", + User: "example.com", }, }, }, @@ -109,3 +112,39 @@ func TestDocument_RepositoryFullName(t *testing.T) { } } } + +func TestDocument_UserName(t *testing.T) { + testCases := []struct { + repositoryURL string + expectedUserName string + }{ + { + repositoryURL: "https://github.com/user/repo", + expectedUserName: "user", + }, + { + repositoryURL: "https://github.com//user/repo////", + expectedUserName: "user", + }, + { + repositoryURL: "repo/", + expectedUserName: "repo", + }, + { + repositoryURL: "", + expectedUserName: "", + }, + { + repositoryURL: "git@github.com:user/repo", + expectedUserName: "user", + }, + } + + for _, tc := range testCases { + returnedUserName := UserName(tc.repositoryURL) + if returnedUserName != tc.expectedUserName { + t.Errorf("UserName expected %s, got %s", + tc.expectedUserName, returnedUserName) + } + } +} 
diff --git a/api/internal/crawl/index/elasticsearch.go b/api/internal/crawl/index/elasticsearch.go index ebd6b70c0..a1f465a36 100644 --- a/api/internal/crawl/index/elasticsearch.go +++ b/api/internal/crawl/index/elasticsearch.go @@ -20,6 +20,9 @@ const IndexConfig = ` "repositoryUrl": { "type": "keyword" }, + "user": { + "type": "keyword" + }, "filePath": { "type": "keyword" }, diff --git a/api/internal/crawl/search_cmds/id.md b/api/internal/crawl/search_cmds/id.md new file mode 100644 index 000000000..66d767eae --- /dev/null +++ b/api/internal/crawl/search_cmds/id.md @@ -0,0 +1,12 @@ +Find the document with the given `_id`: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "terms": { + "_id": [ "b3a03f3327841617db696e2d6abc30e1a1bd653f1a2bbce05637f7dcae1a43f7" ] + } + } +} +' +``` diff --git a/api/internal/crawl/search_cmds/user.md b/api/internal/crawl/search_cmds/user.md new file mode 100644 index 000000000..b8ae3538c --- /dev/null +++ b/api/internal/crawl/search_cmds/user.md @@ -0,0 +1,123 @@ +Find all the documents having the `user` field set: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "exists": { + "field": "user" + } + } +} +' +``` + +Find all the documents whose `user` field is not set: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "size": 10000, + "query": { + "bool": { + "must_not": { + "exists": { + "field": "user" + } + } + } + } +} +' +``` + +Search for all the documents whose `user` field is `kubernetes-sigs`: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "user": "kubernetes-sigs" }} + ] + } + } +} +' +``` + +Count distinct values of the `user` field: +``` +curl -X POST 
"${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "aggs" : { + "user_count" : { + "cardinality" : { + "field" : "user", + "precision_threshold": 40000 + } + } + } +} +' +``` + +List all the values of the `user` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "aggs" : { + "user" : { + "terms" : { + "field" : "user" + } + } + } +} +' +``` + + +For all the kustomization files in the index, list all the values of the +`user` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "filter": [ + { "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }} + ] + } + }, + "aggs" : { + "user" : { + "terms" : { + "field" : "user" + } + } + } +} +' +``` + +For all the non-kustomization files in the index, list all the values of the +`user` field and the frequency of each value: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "bool": { + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + } + } + }, + "aggs" : { + "user" : { + "terms" : { + "field" : "user" + } + } + } +} +' +```