From 29e50ab476eb36da11755bde4bf7c02e5532ef9b Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Fri, 10 Jan 2020 10:23:20 -0800 Subject: [PATCH] Collect stats on generators and transformers --- .../crawl/cmd/kustomize_stats/main.go | 186 +++++++++++++++++- .../config/crawler/kustomize_stats/job.yaml | 5 + api/internal/crawl/doc/docname.go | 5 + 3 files changed, 193 insertions(+), 3 deletions(-) diff --git a/api/internal/crawl/cmd/kustomize_stats/main.go b/api/internal/crawl/cmd/kustomize_stats/main.go index 4f8fc195f..234d1327c 100644 --- a/api/internal/crawl/cmd/kustomize_stats/main.go +++ b/api/internal/crawl/cmd/kustomize_stats/main.go @@ -5,14 +5,24 @@ import ( "flag" "fmt" "log" + "net/http" + "os" + "sigs.k8s.io/kustomize/api/internal/crawl/crawler" "sort" "time" + "sigs.k8s.io/kustomize/api/internal/crawl/crawler/github" + "sigs.k8s.io/kustomize/api/internal/crawl/doc" "sigs.k8s.io/kustomize/api/internal/crawl/index" ) +const ( + githubAccessTokenVar = "GITHUB_ACCESS_TOKEN" + retryCount = 3 +) + // iterateArr adds each item in arr into countMap. 
func iterateArr(arr []string, countMap map[string]int) { for _, item := range arr {
@@ -37,6 +47,159 @@ func SortMapKeyByValue(m map[string]int) []string {
 	return keys
 }
 
+// GeneratorAndTransformerStats prints a report about the documents referred to
+// in the generators and transformers fields of the given kustomization files:
+// how many kustomization files use each field, how many of the referred
+// documents are files or dirs, which git repositories they come from, and how
+// many of them are missing from idx.
+func GeneratorAndTransformerStats(ctx context.Context,
+	generatorDocs []*doc.Document, transformerDocs []*doc.Document,
+	idx *index.KustomizeIndex) {
+	// allGenerators includes all the documents referred in the generators field
+	allGenerators := crawler.NewUniqueDocuments()
+
+	// allTransformers includes all the documents referred in the transformers field
+	allTransformers := crawler.NewUniqueDocuments()
+
+	// docUsingGeneratorCount counts the number of the kustomization files using generators
+	docUsingGeneratorCount := 0
+
+	// docUsingTransformerCount counts the number of the kustomization files using transformers
+	docUsingTransformerCount := 0
+
+	// collect all the documents referred in the generators and transformers fields
+	for _, d := range generatorDocs {
+		kdoc := doc.KustomizationDocument{Document: *d}
+		generators, err := kdoc.GetResources(false, false, true)
+		if err != nil {
+			log.Printf("failed to parse the generators field of the Document (%s): %v",
+				d.Path(), err)
+		}
+		if len(generators) > 0 {
+			docUsingGeneratorCount++
+			allGenerators.AddDocuments(generators)
+		}
+	}
+
+	for _, d := range transformerDocs {
+		kdoc := doc.KustomizationDocument{Document: *d}
+		transformers, err := kdoc.GetResources(false, true, false)
+		if err != nil {
+			log.Printf("failed to parse the transformers field of the Document (%s): %v",
+				d.Path(), err)
+		}
+		if len(transformers) > 0 {
+			docUsingTransformerCount++
+			allTransformers.AddDocuments(transformers)
+		}
+	}
+
+	// fileGeneratorCount and dirGeneratorCount count the file-type and dir-type generators
+	fileGeneratorCount, dirGeneratorCount, generatorFiles, generatorDirs := DocumentTypeSummary(ctx, allGenerators.Documents())
+
+	// fileTransformerCount and dirTransformerCount count the file-type and dir-type transformers
+	fileTransformerCount, dirTransformerCount, transformerFiles, transformerDirs := DocumentTypeSummary(ctx, allTransformers.Documents())
+
+	// check whether any of the generator files are not in the index
+	nonExistGeneratorFileCount := ExistInIndex(idx, generatorFiles, "generator file ")
+	// check whether any of the generator dirs are not in the index
+	nonExistGeneratorDirCount := ExistInIndex(idx, generatorDirs, "generator dir ")
+
+	// check whether any of the transformer files are not in the index
+	nonExistTransformerFileCount := ExistInIndex(idx, transformerFiles, "transformer file ")
+	// check whether any of the transformer dirs are not in the index
+	nonExistTransformerDirCount := ExistInIndex(idx, transformerDirs, "transformer dir ")
+
+	// print how the referred documents are distributed over git repositories
+	GitRepositorySummary(generatorFiles, "generator files")
+	GitRepositorySummary(generatorDirs, "generator dirs")
+	GitRepositorySummary(transformerFiles, "transformer files")
+	GitRepositorySummary(transformerDirs, "transformer dirs")
+
+	fmt.Printf(`%d kustomization files use generators: %d generators are files and %d generators are dirs.
+%d kustomization files use transformers: %d transformers are files and %d transformers are dirs.`,
+		docUsingGeneratorCount, fileGeneratorCount, dirGeneratorCount,
+		docUsingTransformerCount, fileTransformerCount, dirTransformerCount)
+	fmt.Printf("\n")
+	fmt.Printf("%d generator files do not exist in the index\n", nonExistGeneratorFileCount)
+	fmt.Printf("%d generator dirs do not exist in the index\n", nonExistGeneratorDirCount)
+	fmt.Printf("%d transformer files do not exist in the index\n", nonExistTransformerFileCount)
+	fmt.Printf("%d transformer dirs do not exist in the index\n", nonExistTransformerDirCount)
+}
+
+// GitRepositorySummary prints how docs are distributed over git repositories:
+// one line per repository, giving the number of docs from it, with msgPrefix
+// naming the kind of docs; lines follow the order of SortMapKeyByValue.
+func GitRepositorySummary(docs []*doc.Document, msgPrefix string) {
+	// m maps a repository URL to the number of docs coming from it
+	m := make(map[string]int)
+	for _, d := range docs {
+		m[d.RepositoryURL]++
+	}
+	sortedKeys := SortMapKeyByValue(m)
+	for _, k := range sortedKeys {
+		fmt.Printf("%d %s are from %s\n", m[k], msgPrefix, k)
+	}
+}
+
+// ExistInIndex goes through each Document in docs, and checks whether it is in the index or not.
+// It returns the number of documents which do not exist in the index. A document whose
+// lookup fails is logged and skipped: a failed lookup does not show that it is missing.
+func ExistInIndex(idx *index.KustomizeIndex, docs []*doc.Document, msgPrefix string) int {
+	nonExistCount := 0
+	for _, d := range docs {
+		exists, err := idx.Exists(d.ID())
+		if err != nil {
+			log.Printf("failed to check whether %s (%s) exists in the index: %v",
+				msgPrefix, d.Path(), err)
+			continue
+		}
+		if !exists {
+			log.Printf("%s (%s) does not exist in the index", msgPrefix, d.Path())
+			nonExistCount++
+		}
+	}
+	return nonExistCount
+}
+
+// DocumentTypeSummary goes through each doc in docs, and determines whether it is a file or dir.
+// It returns the number of files and dirs together with the docs of each type; docs for which
+// FetchDocument fails are logged and left out. It exits if githubAccessTokenVar is not set.
+func DocumentTypeSummary(ctx context.Context, docs []*doc.Document) (
+	fileCount, dirCount int, files, dirs []*doc.Document) {
+	githubToken := os.Getenv(githubAccessTokenVar)
+	if githubToken == "" {
+		log.Fatalf("Must set the variable '%s' to make github requests.\n", githubAccessTokenVar)
+	}
+	ghCrawler := github.NewCrawler(githubToken, retryCount, &http.Client{}, github.QueryWith())
+
+	for _, d := range docs {
+		oldFilePath := d.FilePath
+		if err := ghCrawler.FetchDocument(ctx, d); err != nil {
+			log.Printf("FetchDocument failed on %s: %v", d.Path(), err)
+			continue
+		}
+
+		// a doc whose FilePath is unchanged after fetching is counted as a file, otherwise as a dir
+		if d.FilePath == oldFilePath {
+			files = append(files, d)
+		} else {
+			dirs = append(dirs, d)
+		}
+	}
+	return len(files), len(dirs), files, dirs
+}
+
+// ExistInSlice checks whether target exists in items.
+func ExistInSlice(items []string, target string) bool {
+	for _, item := range items {
+		if item == target {
+			return true
+		}
+	}
+	return false
+}
+
 func main() {
 	topKindsPtr := flag.Int(
 		"kinds", -1,
@@ -53,10 +216,12 @@ If you only want to list the 10 most popular identifiers, set the flag to 10.`)
 		`the number of kustomize features to be listed according to their popularities.
 By default, all the features will be listed.
 If you only want to list the 10 most popular features, set the flag to 10.`)
+	indexNamePtr := flag.String(
+		"index", "kustomize", "The name of the ElasticSearch index.")
 	flag.Parse()
 
 	ctx := context.Background()
-	idx, err := index.NewKustomizeIndex(ctx)
+	idx, err := index.NewKustomizeIndex(ctx, *indexNamePtr)
 	if err != nil {
 		log.Fatalf("Could not create an index: %v\n", err)
 	}
@@ -74,6 +239,12 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
 	// ids tracks the unique IDs of the documents in the index
 	ids := make(map[string]struct{})
 
+	// generatorDocs includes all the docs using generators
+	var generatorDocs []*doc.Document
+
+	// transformerDocs includes all the docs using transformers
+	var transformerDocs []*doc.Document
+
 	// get all the documents in the index
 	query := []byte(`{ "query":{ "match_all":{} } }`)
 	it := idx.IterateQuery(query, 10000, 60*time.Second)
@@ -83,7 +254,7 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
 			if _, ok := ids[hit.ID]; !ok {
 				ids[hit.ID] = struct{}{}
 			} else {
-				fmt.Printf("Found duplicate ID (%s)\n", hit.ID)
+				log.Printf("Found duplicate ID (%s)\n", hit.ID)
 			}
 
 			count++
@@ -93,11 +264,18 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
 			if doc.IsKustomizationFile(hit.Document.FilePath) {
 				kustomizationFilecount++
 				iterateArr(hit.Document.Identifiers, kustomizeIdentifiersMap)
+				if ExistInSlice(hit.Document.Identifiers, "generators") {
+					generatorDocs = append(generatorDocs, hit.Document.Copy())
+				}
+				if ExistInSlice(hit.Document.Identifiers, "transformers") {
+					transformerDocs = append(transformerDocs, hit.Document.Copy())
+				}
 			}
 		}
 	}
+
 	if err := it.Err(); err != nil {
-		fmt.Printf("Error iterating: %v\n", err)
+		log.Fatalf("Error iterating: %v\n", err)
 	}
 
 	sortedKindsMapKeys := SortMapKeyByValue(kindsMap)
@@ -136,4 +314,6 @@ There are %d documents in the kustomize index.
 			kustomizeFeatureCount++
 		}
 	}
+
+	GeneratorAndTransformerStats(ctx, generatorDocs, transformerDocs, idx)
 }
diff --git a/api/internal/crawl/config/crawler/kustomize_stats/job.yaml b/api/internal/crawl/config/crawler/kustomize_stats/job.yaml
index e7d22cd51..15ec4c4fd 100644
--- a/api/internal/crawl/config/crawler/kustomize_stats/job.yaml
+++ b/api/internal/crawl/config/crawler/kustomize_stats/job.yaml
@@ -13,6 +13,11 @@ spec:
         command: ["/kustomize_stats"]
         args: ["--kinds=50", "--identifiers=50", "--kustomize-features=50"]
         env:
+        - name: GITHUB_ACCESS_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: github-access-token
+              key: token
         - name: ELASTICSEARCH_URL
           valueFrom:
             configMapKeyRef:
diff --git a/api/internal/crawl/doc/docname.go b/api/internal/crawl/doc/docname.go
index 5afca1290..e295e4620 100644
--- a/api/internal/crawl/doc/docname.go
+++ b/api/internal/crawl/doc/docname.go
@@ -35,6 +35,11 @@ func (doc *Document) Copy() *Document {
 	}
 }
 
+// Path returns a string describing the document's location: its repository URL, file path and default branch.
+func (doc *Document) Path() string {
+	return fmt.Sprintf("repoURL: %s filePath: %s branch: %s", doc.RepositoryURL, doc.FilePath, doc.DefaultBranch)
+}
+
 // Implements the CrawlerDocument interface.
 func (doc *Document) WasCached() bool {
 	return doc.IsSame