mirror of
https://github.com/kubernetes-sigs/kustomize.git
synced 2026-06-29 09:40:49 +00:00
Merge pull request #2172 from haiyanmeng/stats
Several improvements on crawler
This commit is contained in:
@@ -167,7 +167,7 @@ func main() {
|
||||
it := idx.IterateQuery(query, 10000, 60*time.Second)
|
||||
for it.Next() {
|
||||
for _, hit := range it.Value().Hits.Hits {
|
||||
seedDocs = append(seedDocs, hit.Document.Copy())
|
||||
seedDocs = append(seedDocs, hit.Document.Document.Copy())
|
||||
}
|
||||
}
|
||||
if err := it.Err(); err != nil {
|
||||
|
||||
@@ -2,16 +2,13 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"sigs.k8s.io/kustomize/api/internal/crawl/crawler/github"
|
||||
|
||||
"sigs.k8s.io/kustomize/api/internal/crawl/doc"
|
||||
|
||||
"sigs.k8s.io/kustomize/api/internal/crawl/index"
|
||||
@@ -34,9 +31,9 @@ func iterateArr(arr []string, countMap map[string]int) {
|
||||
|
||||
}
|
||||
|
||||
// SortMapKeyByValue takes a map as its input, sorts its keys according to their values
|
||||
// SortMapKeyByValueInt takes a map as its input, sorts its keys according to their values
|
||||
// in the map, and outputs the sorted keys as a slice.
|
||||
func SortMapKeyByValue(m map[string]int) []string {
|
||||
func SortMapKeyByValueInt(m map[string]int) []string {
|
||||
keys := make([]string, 0, len(m))
|
||||
for key := range m {
|
||||
keys = append(keys, key)
|
||||
@@ -46,56 +43,58 @@ func SortMapKeyByValue(m map[string]int) []string {
|
||||
return keys
|
||||
}
|
||||
|
||||
func GeneratorOrTransformerStats(ctx context.Context,
|
||||
docs []*doc.Document, isGenerator bool, idx *index.KustomizeIndex) {
|
||||
// SortMapKeyByValue takes a map as its input, sorts its keys according to their values
|
||||
// in the map, and outputs the sorted keys as a slice.
|
||||
func SortMapKeyByValueLen(m map[string][]string) []string {
|
||||
keys := make([]string, 0, len(m))
|
||||
for key := range m {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
// sort keys according to their values in the map m
|
||||
sort.Slice(keys, func(i, j int) bool { return len(m[keys[i]]) > len(m[keys[j]]) })
|
||||
return keys
|
||||
}
|
||||
|
||||
fieldName := "generators"
|
||||
if !isGenerator {
|
||||
fieldName = "transformers"
|
||||
func GeneratorOrTransformerStats(docs []*doc.KustomizationDocument) {
|
||||
n := len(docs)
|
||||
if n == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// allReferredDocs includes all the documents referred in the field
|
||||
allReferredDocs := doc.NewUniqueDocuments()
|
||||
fileType := docs[0].FileType
|
||||
fmt.Printf("There are totally %d %s files.\n", n, fileType)
|
||||
|
||||
// docUsingGeneratorCount counts the number of the kustomization files using generators or transformers
|
||||
docCount := 0
|
||||
GitRepositorySummary(docs, fileType)
|
||||
|
||||
// key of kindToUrls: a string in the KustomizationDocument.Kinds field
|
||||
// value of kindToUrls: a slice of string urls defining a given kind.
|
||||
kindToUrls := make(map[string][]string)
|
||||
|
||||
// collect all the documents referred in the field
|
||||
for _, d := range docs {
|
||||
kdoc := doc.KustomizationDocument{
|
||||
Document: *d,
|
||||
}
|
||||
referredDocs, err := kdoc.GetResources(false, !isGenerator, isGenerator)
|
||||
if err != nil {
|
||||
log.Printf("failed to parse the %s field of the Document (%s): %v",
|
||||
fieldName, d.Path(), err)
|
||||
}
|
||||
if len(referredDocs) > 0 {
|
||||
docCount++
|
||||
allReferredDocs.AddDocuments(referredDocs)
|
||||
url := fmt.Sprintf("%s/blob/%s/%s", d.RepositoryURL, d.DefaultBranch, d.FilePath)
|
||||
for _, kind := range d.Kinds {
|
||||
if _, ok := kindToUrls[kind]; !ok {
|
||||
kindToUrls[kind] = []string{url}
|
||||
} else {
|
||||
kindToUrls[kind] = append(kindToUrls[kind], url)
|
||||
}
|
||||
}
|
||||
}
|
||||
fmt.Printf("There are totally %d kinds of %s\n", len(kindToUrls), fileType)
|
||||
sortedKeys := SortMapKeyByValueLen(kindToUrls)
|
||||
for _, k := range sortedKeys {
|
||||
sort.Strings(kindToUrls[k])
|
||||
fmt.Printf("%s kind %s appears %d times\n", fileType, k, len(kindToUrls[k]))
|
||||
for _, url := range kindToUrls[k] {
|
||||
fmt.Printf("%s\n", url)
|
||||
}
|
||||
}
|
||||
|
||||
fileCount, dirCount, fileTypeDocs, dirTypeDocs := DocumentTypeSummary(ctx, allReferredDocs.Documents())
|
||||
|
||||
// check whether any of the files are not in the index
|
||||
nonExistFileCount := ExistInIndex(idx, fileTypeDocs, fieldName + " file ")
|
||||
// check whether any of the dirs are not in the index
|
||||
nonExistDirCount := ExistInIndex(idx, dirTypeDocs, fieldName + " dir ")
|
||||
|
||||
GitRepositorySummary(fileTypeDocs, fieldName + " files")
|
||||
GitRepositorySummary(dirTypeDocs, fieldName + " dirs")
|
||||
|
||||
fmt.Printf("%d kustomization files use %s: %d %s are files and %d %s are dirs.\n",
|
||||
docCount, fieldName, fileCount, fieldName, dirCount, fieldName)
|
||||
fmt.Printf("%d %s files do not exist in the index\n", nonExistFileCount, fieldName)
|
||||
fmt.Printf("%d %s dirs do not exist in the index\n", nonExistDirCount, fieldName)
|
||||
}
|
||||
|
||||
// GitRepositorySummary counts the distribution of docs:
|
||||
// 1) how many git repositories are these docs from?
|
||||
// 2) how many docs are from each git repository?
|
||||
func GitRepositorySummary(docs []*doc.Document, msgPrefix string) {
|
||||
func GitRepositorySummary(docs []*doc.KustomizationDocument, fileType string) {
|
||||
m := make(map[string]int)
|
||||
for _, d := range docs {
|
||||
if _, ok := m[d.RepositoryURL]; ok {
|
||||
@@ -104,65 +103,16 @@ func GitRepositorySummary(docs []*doc.Document, msgPrefix string) {
|
||||
m[d.RepositoryURL] = 1
|
||||
}
|
||||
}
|
||||
sortedKeys := SortMapKeyByValue(m)
|
||||
sortedKeys := SortMapKeyByValueInt(m)
|
||||
topN := 10
|
||||
i := 0
|
||||
for _, k := range sortedKeys {
|
||||
fmt.Printf("%d %s are from %s\n", m[k], msgPrefix, k)
|
||||
}
|
||||
}
|
||||
|
||||
// ExistInIndex goes through each Document in docs, and check whether it is in the index or not.
|
||||
// It returns the number of documents which does not exist in the index.
|
||||
func ExistInIndex(idx *index.KustomizeIndex, docs []*doc.Document, msgPrefix string) int {
|
||||
nonExistCount := 0
|
||||
for _, d := range docs {
|
||||
exists, err := idx.Exists(d.ID())
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
if !exists {
|
||||
log.Printf("%s (%s) does not exist in the index", msgPrefix, d.Path())
|
||||
nonExistCount++
|
||||
if i >= topN {
|
||||
break
|
||||
}
|
||||
fmt.Printf("%d %s are from %s\n", m[k], fileType, k)
|
||||
i++
|
||||
}
|
||||
return nonExistCount
|
||||
}
|
||||
|
||||
// DocumentTypeSummary goes through each doc in docs, and determines whether it is a file or dir.
|
||||
func DocumentTypeSummary(ctx context.Context, docs []*doc.Document) (
|
||||
fileCount, dirCount int, files, dirs []*doc.Document) {
|
||||
githubToken := os.Getenv(githubAccessTokenVar)
|
||||
if githubToken == "" {
|
||||
log.Fatalf("Must set the variable '%s' to make github requests.\n",
|
||||
githubAccessTokenVar)
|
||||
}
|
||||
ghCrawler := github.NewCrawler(githubToken, retryCount, &http.Client{}, github.QueryWith())
|
||||
|
||||
for _, d := range docs {
|
||||
oldFilePath := d.FilePath
|
||||
if err := ghCrawler.FetchDocument(ctx, d); err != nil {
|
||||
log.Printf("FetchDocument failed on %s: %v", d.Path(), err)
|
||||
continue
|
||||
}
|
||||
|
||||
if d.FilePath == oldFilePath {
|
||||
fileCount++
|
||||
files = append(files, d)
|
||||
} else {
|
||||
dirCount++
|
||||
dirs = append(dirs, d)
|
||||
}
|
||||
}
|
||||
return fileCount, dirCount, files, dirs
|
||||
}
|
||||
|
||||
// ExistInSlice checks where target exits in items.
|
||||
func ExistInSlice(items []string, target string) bool {
|
||||
for _, item := range items {
|
||||
if item == target {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func main() {
|
||||
@@ -204,17 +154,26 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
|
||||
// ids tracks the unique IDs of the documents in the index
|
||||
ids := make(map[string]struct{})
|
||||
|
||||
// generatorDocs includes all the docs using generators
|
||||
generatorDocs := make([]*doc.Document, 0)
|
||||
// generatorFiles include all the non-kustomization files whose FileType is generator
|
||||
generatorFiles := make([]*doc.KustomizationDocument, 0)
|
||||
|
||||
// transformersDocs includes all the docs using transformers
|
||||
transformersDocs := make([]*doc.Document, 0)
|
||||
// transformersFiles include all the non-kustomization files whose FileType is transformer
|
||||
transformersFiles := make([]*doc.KustomizationDocument, 0)
|
||||
|
||||
checksums := make(map[string]int)
|
||||
|
||||
// get all the documents in the index
|
||||
query := []byte(`{ "query":{ "match_all":{} } }`)
|
||||
it := idx.IterateQuery(query, 10000, 60*time.Second)
|
||||
for it.Next() {
|
||||
for _, hit := range it.Value().Hits.Hits {
|
||||
sum := fmt.Sprintf("%x", sha256.Sum256([]byte(hit.Document.DocumentData)))
|
||||
if _, ok := checksums[sum]; ok {
|
||||
checksums[sum]++
|
||||
} else {
|
||||
checksums[sum] = 1
|
||||
}
|
||||
|
||||
// check whether there is any duplicate IDs in the index
|
||||
if _, ok := ids[hit.ID]; !ok {
|
||||
ids[hit.ID] = struct{}{}
|
||||
@@ -229,11 +188,13 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
|
||||
if doc.IsKustomizationFile(hit.Document.FilePath) {
|
||||
kustomizationFilecount++
|
||||
iterateArr(hit.Document.Identifiers, kustomizeIdentifiersMap)
|
||||
if ExistInSlice(hit.Document.Identifiers, "generators") {
|
||||
generatorDocs = append(generatorDocs, hit.Document.Copy())
|
||||
}
|
||||
if ExistInSlice(hit.Document.Identifiers, "transformers") {
|
||||
transformersDocs = append(transformersDocs, hit.Document.Copy())
|
||||
|
||||
} else {
|
||||
switch hit.Document.FileType {
|
||||
case "generator":
|
||||
generatorFiles = append(generatorFiles, hit.Document.Copy())
|
||||
case "transformer":
|
||||
transformersFiles = append(transformersFiles, hit.Document.Copy())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -243,9 +204,9 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
|
||||
log.Fatalf("Error iterating: %v\n", err)
|
||||
}
|
||||
|
||||
sortedKindsMapKeys := SortMapKeyByValue(kindsMap)
|
||||
sortedIdentifiersMapKeys := SortMapKeyByValue(identifiersMap)
|
||||
sortedKustomizeIdentifiersMapKeys := SortMapKeyByValue(kustomizeIdentifiersMap)
|
||||
sortedKindsMapKeys := SortMapKeyByValueInt(kindsMap)
|
||||
sortedIdentifiersMapKeys := SortMapKeyByValueInt(identifiersMap)
|
||||
sortedKustomizeIdentifiersMapKeys := SortMapKeyByValueInt(kustomizeIdentifiersMap)
|
||||
|
||||
fmt.Printf(`The count of unique document IDs in the kustomize index: %d
|
||||
There are %d documents in the kustomize index.
|
||||
@@ -280,6 +241,14 @@ There are %d documents in the kustomize index.
|
||||
}
|
||||
}
|
||||
|
||||
GeneratorOrTransformerStats(ctx, generatorDocs, true, idx)
|
||||
GeneratorOrTransformerStats(ctx, transformersDocs, false, idx)
|
||||
GeneratorOrTransformerStats(generatorFiles)
|
||||
GeneratorOrTransformerStats(transformersFiles)
|
||||
|
||||
fmt.Printf("There are total %d checksums of document contents\n", len(checksums))
|
||||
sortedChecksums := SortMapKeyByValueInt(checksums)
|
||||
sortedChecksums = sortedChecksums[:20]
|
||||
fmt.Printf("The top 20 checksums are:\n")
|
||||
for _, key := range sortedChecksums {
|
||||
fmt.Printf("checksum %s apprears %d\n", key, checksums[key])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
apiVersion: batch/v1beta1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: crawler
|
||||
name: crawler-cronjob
|
||||
spec:
|
||||
schedule: "5 0 * * */1"
|
||||
# run the cronjob at 00:00 every 7 days
|
||||
schedule: "0 0 */7 * *"
|
||||
jobTemplate:
|
||||
spec:
|
||||
template:
|
||||
@@ -11,7 +12,9 @@ spec:
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: crawler
|
||||
image: gcr.io/kustomize-search/crawler:latest
|
||||
image: gcr.io/haiyanmeng-gke-dev/crawler:v1
|
||||
command: ["/crawler"]
|
||||
args: ["--mode=index+github", "--github-repo=kubernetes-sigs/kustomize", "--index=kustomize"]
|
||||
imagePullPolicy: Always
|
||||
env:
|
||||
- name: GITHUB_ACCESS_TOKEN
|
||||
|
||||
16
api/internal/crawl/config/elastic/esbackup.yaml
Normal file
16
api/internal/crawl/config/elastic/esbackup.yaml
Normal file
@@ -0,0 +1,16 @@
|
||||
# Creating `esbackup/kustomize-backbup` will create the `kustomize-backup` snapshot repository.
|
||||
# Deleting `esbackup/kustomize-backbup` will delete the `kustomize-backup` snapshot repository and all the snapshots in the repository.
|
||||
# Deleting `esbackup/kustomize-backbup` will NOT delete essnapshot and esrestore objects.
|
||||
apiVersion: elasticsearch.cloud.google.com/v1alpha1
|
||||
kind: ESBackup
|
||||
metadata:
|
||||
name: kustomize-backup
|
||||
spec:
|
||||
storage:
|
||||
gcs:
|
||||
bucket: kustomize-backup
|
||||
path: kustomize
|
||||
secret:
|
||||
name: kustomizesa
|
||||
escluster:
|
||||
name: esbasic
|
||||
@@ -8,6 +8,13 @@ spec:
|
||||
- repository-gcs
|
||||
- ingest-user-agent
|
||||
- ingest-geoip
|
||||
# To set `gcpserviceaccount`,
|
||||
# First, create and download a GCP service account into a json file, named `sakey.json` following the instruction:
|
||||
# https://www.elastic.co/guide/en/elasticsearch/plugins/6.5/repository-gcs-usage.html#repository-gcs-using-service-account
|
||||
# Second, create a secret for the service account using the following command:
|
||||
# $ kubectl create secret generic kustomizesa --from-file=./sakey.json
|
||||
gcpserviceaccount:
|
||||
name: kustomizesa
|
||||
config:
|
||||
env:
|
||||
example: test
|
||||
|
||||
16
api/internal/crawl/config/elastic/esrestore.yaml
Normal file
16
api/internal/crawl/config/elastic/esrestore.yaml
Normal file
@@ -0,0 +1,16 @@
|
||||
# Creating `esrestore/kustomize-restore` will restore the `kuostmize` index in the `kustomize-snapshot` snapshot to a new index named `kusotmize-restore`.
|
||||
# Deleting `esrestore/kustomize-restore` will not delete the restored index.
|
||||
# Deleting `esrestore/kustomize-restore` should happen before deleting `essnapshot/kustomize-snapshot`.
|
||||
apiVersion: elasticsearch.cloud.google.com/v1alpha1
|
||||
kind: ESRestore
|
||||
metadata:
|
||||
name: kustomize-restore
|
||||
spec:
|
||||
include_global_state: true
|
||||
ignore_unavailable: true
|
||||
rename_pattern: kustomize
|
||||
rename_replacement: kustomize-restore
|
||||
essnapshot:
|
||||
name: kustomize-snapshot
|
||||
escluster:
|
||||
name: esbasic
|
||||
15
api/internal/crawl/config/elastic/essnaptshot.yaml
Normal file
15
api/internal/crawl/config/elastic/essnaptshot.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
# Creating `essnapshot/kustomize-snapshot` will create a snapshot named `kustomize-snapshot` in the `kustomize-backup` snapshot repository.
|
||||
# Deleting `essnapshot/kustomize-snapshot` will delete the snapshot.
|
||||
# Deleting `essnapshot/kustomize-snapshot` should happen before deleting `esbackup/kustomize-backup`.
|
||||
apiVersion: elasticsearch.cloud.google.com/v1alpha1
|
||||
kind: ESSnapshot
|
||||
metadata:
|
||||
name: kustomize-snapshot
|
||||
spec:
|
||||
# indices are optional. If not specified all indices are selected.
|
||||
indices:
|
||||
- kustomize
|
||||
include_global_state: true
|
||||
ignore_unavailable: true
|
||||
esbackup:
|
||||
name: kustomize-backup
|
||||
@@ -80,6 +80,36 @@ func (gc githubCrawler) DefaultBranch(repo string) string {
|
||||
func (gc githubCrawler) Crawl(ctx context.Context,
|
||||
output chan<- crawler.CrawledDocument, seen utils.SeenMap) error {
|
||||
|
||||
ranges := []RangeWithin{
|
||||
RangeWithin{
|
||||
start: uint64(0),
|
||||
end: githubMaxFileSize,
|
||||
},
|
||||
}
|
||||
|
||||
errs := make(multiError, 0)
|
||||
for len(ranges) > 0 {
|
||||
tailRange := ranges[len(ranges) - 1]
|
||||
ranges = ranges[:(len(ranges) - 1)]
|
||||
reProcessQueryRanges, err := gc.CrawlSingleRange(ctx, output, seen, tailRange.start, tailRange.end)
|
||||
if err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
ranges = append(ranges, reProcessQueryRanges...)
|
||||
}
|
||||
|
||||
if len(errs) > 0 {
|
||||
return errs
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gc githubCrawler) CrawlSingleRange(ctx context.Context,
|
||||
output chan<- crawler.CrawledDocument, seen utils.SeenMap,
|
||||
lowerBound, upperBound uint64) ([]RangeWithin, error) {
|
||||
|
||||
log.Printf("CrawlSingleRange [%d, %d]", lowerBound, upperBound)
|
||||
|
||||
noETagClient := GhClient{
|
||||
RequestConfig: gc.client.RequestConfig,
|
||||
client: &http.Client{Timeout: gc.client.client.Timeout},
|
||||
@@ -87,13 +117,16 @@ func (gc githubCrawler) Crawl(ctx context.Context,
|
||||
accessToken: gc.client.accessToken,
|
||||
}
|
||||
|
||||
var reProcessQueryRanges []RangeWithin
|
||||
|
||||
var ranges []string
|
||||
var err error
|
||||
// Since Github returns a max of 1000 results per query, we can use
|
||||
// multiple queries that split the search space into chunks of at most
|
||||
// 1000 files to get all of the data.
|
||||
for i := 0; i < 5; i++ {
|
||||
ranges, err = FindRangesForRepoSearch(newCache(noETagClient, gc.query))
|
||||
ranges, err = FindRangesForRepoSearch(newCache(noETagClient, gc.query),
|
||||
lowerBound, upperBound)
|
||||
if err == nil {
|
||||
logger.Printf("FindRangesForRepoSearch succeeded after %d retries", i)
|
||||
break
|
||||
@@ -102,7 +135,7 @@ func (gc githubCrawler) Crawl(ctx context.Context,
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not split %v into ranges, %v\n",
|
||||
return reProcessQueryRanges, fmt.Errorf("could not split %v into ranges, %v\n",
|
||||
gc.query, err)
|
||||
}
|
||||
|
||||
@@ -112,20 +145,23 @@ func (gc githubCrawler) Crawl(ctx context.Context,
|
||||
errs := make(multiError, 0)
|
||||
queryResult := RangeQueryResult{}
|
||||
for _, query := range ranges {
|
||||
rangeResult, err := processQuery(ctx, gc.client, query, output, seen, gc.branchMap)
|
||||
reProcessQuery, rangeResult, err := processQuery(ctx, gc.client, query, output, seen, gc.branchMap)
|
||||
if err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
queryResult.Add(rangeResult)
|
||||
if reProcessQuery {
|
||||
reProcessQueryRanges = append(reProcessQueryRanges, RangeSizes(query))
|
||||
}
|
||||
}
|
||||
|
||||
logger.Printf("Summary of Crawl: %s", queryResult.String())
|
||||
|
||||
if len(errs) > 0 {
|
||||
return errs
|
||||
return reProcessQueryRanges, errs
|
||||
}
|
||||
|
||||
return nil
|
||||
return reProcessQueryRanges, nil
|
||||
}
|
||||
|
||||
// FetchDocument first tries to fetch the document with d.FilePath. If it fails,
|
||||
@@ -225,7 +261,7 @@ func (r *RangeQueryResult) String() string {
|
||||
// documents from the crawl to the datastore/index.
|
||||
func processQuery(ctx context.Context, gcl GhClient, query string,
|
||||
output chan<- crawler.CrawledDocument, seen utils.SeenMap,
|
||||
branchMap map[string]string) (RangeQueryResult, error) {
|
||||
branchMap map[string]string) (bool, RangeQueryResult, error) {
|
||||
|
||||
queryPages := make(chan GhResponseInfo)
|
||||
|
||||
@@ -241,6 +277,8 @@ func processQuery(ctx context.Context, gcl GhClient, query string,
|
||||
close(queryPages)
|
||||
}()
|
||||
|
||||
reProcessQuery := false
|
||||
|
||||
errs := make(multiError, 0)
|
||||
result := RangeQueryResult{}
|
||||
pageID := 1
|
||||
@@ -271,11 +309,15 @@ func processQuery(ctx context.Context, gcl GhClient, query string,
|
||||
result.Add(pageResult)
|
||||
|
||||
pageID++
|
||||
|
||||
if page.Parsed.TotalCount > githubMaxResultsPerQuery {
|
||||
reProcessQuery = true
|
||||
}
|
||||
}
|
||||
|
||||
logger.Printf("Summary of processQuery: %s", result.String())
|
||||
|
||||
return result, errs
|
||||
return reProcessQuery, result, errs
|
||||
}
|
||||
|
||||
func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap,
|
||||
@@ -337,7 +379,7 @@ func (gcl GhClient) ForwardPaginatedQuery(ctx context.Context, query string,
|
||||
output chan<- GhResponseInfo) error {
|
||||
|
||||
logger.Println("querying: ", query)
|
||||
response := gcl.parseGithubResponse(query)
|
||||
response := gcl.parseGithubResponseWithRetry(query)
|
||||
|
||||
if response.Error != nil {
|
||||
return response.Error
|
||||
@@ -350,7 +392,7 @@ func (gcl GhClient) ForwardPaginatedQuery(ctx context.Context, query string,
|
||||
case <-ctx.Done():
|
||||
return nil
|
||||
default:
|
||||
response = gcl.parseGithubResponse(response.NextURL)
|
||||
response = gcl.parseGithubResponseWithRetry(response.NextURL)
|
||||
if response.Error != nil {
|
||||
return response.Error
|
||||
}
|
||||
@@ -545,6 +587,8 @@ type githubResponse struct {
|
||||
// This is the number of files that match the query.
|
||||
TotalCount uint64 `json:"total_count,omitempty"`
|
||||
|
||||
IncompleteResults bool `json:"incomplete_results,omitempty"`
|
||||
|
||||
// Github representation of a file.
|
||||
Items []GhFileSpec `json:"items,omitempty"`
|
||||
}
|
||||
@@ -587,6 +631,17 @@ func parseGithubLinkFormat(links string) (string, string) {
|
||||
return next, last
|
||||
}
|
||||
|
||||
func (gcl GhClient) parseGithubResponseWithRetry(getRequest string) GhResponseInfo {
|
||||
resp := gcl.parseGithubResponse(getRequest)
|
||||
retries := 0
|
||||
for resp.Parsed.IncompleteResults {
|
||||
resp = gcl.parseGithubResponse(getRequest)
|
||||
retries++
|
||||
}
|
||||
log.Printf("The result of query(%s) is complete after %d retries", getRequest, retries)
|
||||
return resp
|
||||
}
|
||||
|
||||
func (gcl GhClient) parseGithubResponse(getRequest string) GhResponseInfo {
|
||||
resp, err := gcl.SearchGithubAPI(getRequest)
|
||||
requestInfo := GhResponseInfo{
|
||||
|
||||
@@ -100,6 +100,8 @@ package github
|
||||
import (
|
||||
"fmt"
|
||||
"math/bits"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Files cannot be more than 2^19 bytes, according to
|
||||
@@ -112,7 +114,7 @@ const (
|
||||
// Interface instead of struct for testing purposes.
|
||||
// Not expecting to have multiple implementations.
|
||||
type cachedSearch interface {
|
||||
CountResults(uint64) (uint64, error)
|
||||
CountResults(uint64, uint64) (uint64, error)
|
||||
RequestString(filesize rangeFormatter) string
|
||||
}
|
||||
|
||||
@@ -161,16 +163,16 @@ func newCache(client GhClient, query Query) githubCachedSearch {
|
||||
}
|
||||
}
|
||||
|
||||
func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) {
|
||||
func (c githubCachedSearch) CountResults(lowerBound, upperBound uint64) (uint64, error) {
|
||||
count, cached := c.cache[upperBound]
|
||||
if cached {
|
||||
return count, nil
|
||||
}
|
||||
|
||||
sizeRange := RangeWithin{0, upperBound}
|
||||
sizeRange := RangeWithin{lowerBound, upperBound}
|
||||
rangeRequest := c.RequestString(sizeRange)
|
||||
|
||||
result := c.gcl.parseGithubResponse(rangeRequest)
|
||||
result := c.gcl.parseGithubResponseWithRetry(rangeRequest)
|
||||
if result.Error != nil {
|
||||
return count, result.Error
|
||||
}
|
||||
@@ -204,7 +206,7 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) {
|
||||
"Retrying query... current lower bound: %d, got: %d\n",
|
||||
c.cache[prev], result.Parsed.TotalCount)
|
||||
|
||||
result = c.gcl.parseGithubResponse(rangeRequest)
|
||||
result = c.gcl.parseGithubResponseWithRetry(rangeRequest)
|
||||
if result.Error != nil {
|
||||
return count, result.Error
|
||||
}
|
||||
@@ -219,8 +221,8 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) {
|
||||
}
|
||||
|
||||
count = result.Parsed.TotalCount
|
||||
logger.Printf("Caching new query %s, with count %d\n",
|
||||
sizeRange.RangeString(), count)
|
||||
logger.Printf("Caching new query %s, with count %d (incomplete_results: %v)\n",
|
||||
sizeRange.RangeString(), count, result.Parsed.IncompleteResults)
|
||||
c.cache[upperBound] = count
|
||||
return count, nil
|
||||
}
|
||||
@@ -238,8 +240,8 @@ func (c githubCachedSearch) RequestString(filesize rangeFormatter) string {
|
||||
// This would mean that the search as it is could not find all files. If queries
|
||||
// are sorted by last indexed, and retrieved on regular intervals, it should be
|
||||
// sufficient to get most if not all documents.
|
||||
func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
|
||||
totalFiles, err := cache.CountResults(githubMaxFileSize)
|
||||
func FindRangesForRepoSearch(cache cachedSearch, lowerBound, upperBound uint64) ([]string, error) {
|
||||
totalFiles, err := cache.CountResults(lowerBound, upperBound)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -247,7 +249,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
|
||||
|
||||
if githubMaxResultsPerQuery >= totalFiles {
|
||||
return []string{
|
||||
cache.RequestString(RangeWithin{0, githubMaxFileSize}),
|
||||
cache.RequestString(RangeWithin{lowerBound, upperBound}),
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -275,6 +277,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
|
||||
// range.
|
||||
filesAccessible := uint64(0)
|
||||
sizes := make([]uint64, 0)
|
||||
sizes = append(sizes, lowerBound)
|
||||
for filesAccessible < totalFiles {
|
||||
target := filesAccessible + githubMaxResultsPerQuery
|
||||
if target >= totalFiles {
|
||||
@@ -284,22 +287,22 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
|
||||
logger.Printf("%d accessible files, next target = %d\n",
|
||||
filesAccessible, target)
|
||||
|
||||
cur, err := lowerBoundFileCount(cache, target)
|
||||
size, err := FindFileSize(cache, target, lowerBound, upperBound)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// If there are more than 1000 files in the next bucket, we must
|
||||
// advance anyway and lose out on some files :(.
|
||||
if l := len(sizes); l > 0 && sizes[l-1] == cur {
|
||||
cur++
|
||||
if l := len(sizes); l > 0 && sizes[l-1] == size {
|
||||
size++
|
||||
}
|
||||
|
||||
nextAccessible, err := cache.CountResults(cur)
|
||||
nextAccessible, err := cache.CountResults(lowerBound, size)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(
|
||||
"cache should be populated at %d already, got %v",
|
||||
cur, err)
|
||||
size, err)
|
||||
}
|
||||
if nextAccessible < filesAccessible {
|
||||
return nil, fmt.Errorf(
|
||||
@@ -309,31 +312,31 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
|
||||
|
||||
filesAccessible = nextAccessible
|
||||
if nextAccessible < totalFiles {
|
||||
sizes = append(sizes, cur)
|
||||
sizes = append(sizes, size)
|
||||
}
|
||||
}
|
||||
|
||||
sizes = append(sizes, upperBound)
|
||||
return formatFilesizeRanges(cache, sizes), nil
|
||||
}
|
||||
|
||||
// lowerBoundFileCount finds the filesize range from [0, return value] that has
|
||||
// FindFileSize finds the filesize range from [lowerBound, return value] that has
|
||||
// the largest file count that is smaller than or equal to
|
||||
// githubMaxResultsPerQuery. It is important to note that this returned value
|
||||
// could already be in a previous range if the next file size has more than 1000
|
||||
// results. It is left to the caller to handle this bit of logic and guarantee
|
||||
// forward progession in this case.
|
||||
func lowerBoundFileCount(
|
||||
cache cachedSearch, targetFileCount uint64) (uint64, error) {
|
||||
func FindFileSize(
|
||||
cache cachedSearch, targetFileCount, lowerBound, upperBound uint64) (uint64, error) {
|
||||
|
||||
// Binary search for file sizes that make up the next <=1000 element
|
||||
// chunk.
|
||||
cur := uint64(0)
|
||||
increase := githubMaxFileSize / 2
|
||||
cur := lowerBound
|
||||
increase := (upperBound - lowerBound) / 2
|
||||
|
||||
for increase > 0 {
|
||||
mid := cur + increase
|
||||
|
||||
count, err := cache.CountResults(mid)
|
||||
count, err := cache.CountResults(lowerBound, mid)
|
||||
if err != nil {
|
||||
return count, err
|
||||
}
|
||||
@@ -353,26 +356,24 @@ func lowerBoundFileCount(
|
||||
}
|
||||
|
||||
func formatFilesizeRanges(cache cachedSearch, sizes []uint64) []string {
|
||||
ranges := make([]string, 0, len(sizes)+1)
|
||||
|
||||
if len(sizes) > 0 {
|
||||
ranges = append(ranges, cache.RequestString(
|
||||
RangeLessThan{sizes[0] + 1},
|
||||
))
|
||||
n := len(sizes)
|
||||
if n < 2 {
|
||||
return []string{}
|
||||
}
|
||||
|
||||
for i := 0; i < len(sizes)-1; i += 1 {
|
||||
ranges = append(ranges, cache.RequestString(
|
||||
RangeWithin{sizes[i] + 1, sizes[i+1]},
|
||||
))
|
||||
|
||||
if i != len(sizes)-2 {
|
||||
continue
|
||||
}
|
||||
ranges = append(ranges, cache.RequestString(
|
||||
RangeGreaterThan{sizes[i+1]},
|
||||
))
|
||||
ranges := make([]string, 0, n-1)
|
||||
ranges = append(ranges, cache.RequestString(RangeWithin{sizes[0], sizes[1]}))
|
||||
for i := 1; i < n-1; i++ {
|
||||
ranges = append(ranges, cache.RequestString(RangeWithin{sizes[i] + 1, sizes[i+1]}))
|
||||
}
|
||||
|
||||
return ranges
|
||||
}
|
||||
|
||||
func RangeSizes(s string) RangeWithin {
|
||||
start := strings.Index(s, "+size:") + len("+size:")
|
||||
end := strings.Index(s, "&")
|
||||
ranges := strings.Split(s[start:end], "..")
|
||||
lowerBound, _ := strconv.ParseUint(ranges[0], 10, 64)
|
||||
upperBound, _ := strconv.ParseUint(ranges[1], 10, 64)
|
||||
return RangeWithin{lowerBound, upperBound}
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ type testCachedSearch struct {
|
||||
cache map[uint64]uint64
|
||||
}
|
||||
|
||||
func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) {
|
||||
func (c testCachedSearch) CountResults(lowerBound, upperBound uint64) (uint64, error) {
|
||||
log.Printf("CountResults(%05x)\n", upperBound)
|
||||
count, ok := c.cache[upperBound]
|
||||
if !ok {
|
||||
@@ -73,19 +73,29 @@ func TestRangeSplitting(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
requests, err := FindRangesForRepoSearch(cache)
|
||||
requests, err := FindRangesForRepoSearch(cache, 0, 524288)
|
||||
if err != nil {
|
||||
t.Errorf("Error while finding ranges: %v", err)
|
||||
}
|
||||
expected := []string{
|
||||
"<107", // cache.RequestString(RangeLessThan{0x6b}),
|
||||
"107..128", // cache.RequestString(RangeWithin{0x6b, 0x80}),
|
||||
"129..256", // cache.RequestString(RangeWithin{0x81, 0x100}),
|
||||
"257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}),
|
||||
">4095", // cache.RequestString(RangeGreaterThan{0xfff}),
|
||||
"0..106", // cache.RequestString(RangeWithin{0x00, 0x6a}),
|
||||
"107..128", // cache.RequestString(RangeWithin{0x6b, 0x80}),
|
||||
"129..256", // cache.RequestString(RangeWithin{0x81, 0x100}),
|
||||
"257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}),
|
||||
"4096..524288", // cache.RequestString(RangeWithin{0x1000, 0x80000}),
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(requests, expected) {
|
||||
t.Errorf("Expected requests (%v) to equal (%v)", requests, expected)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRangeSizes(t *testing.T) {
|
||||
s := "https://api.github.com/search/code?q=filename:kustomization.yaml+filename:kustomization.yml" +
|
||||
"+filename:kustomization+size:2365..10000&order=desc&per_page=100&sort=indexed"
|
||||
returnedResult := RangeSizes(s)
|
||||
expectedResult := RangeWithin{uint64(2365), uint64(10000)}
|
||||
if !reflect.DeepEqual(returnedResult, expectedResult) {
|
||||
t.Errorf("RangeSizes expected (%v), got (%v)",expectedResult, returnedResult)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,6 +46,15 @@ type KustomizationDocument struct {
|
||||
|
||||
type set map[string]struct{}
|
||||
|
||||
func (doc *KustomizationDocument) Copy() *KustomizationDocument {
|
||||
return &KustomizationDocument{
|
||||
Document: *(doc.Document.Copy()),
|
||||
Kinds: doc.Kinds,
|
||||
Identifiers: doc.Identifiers,
|
||||
Values: doc.Values,
|
||||
}
|
||||
}
|
||||
|
||||
func (doc *KustomizationDocument) String() string {
|
||||
return fmt.Sprintf("%s %s %s %v %v %v len(identifiers):%v len(values):%v",
|
||||
doc.RepositoryURL, doc.FilePath, doc.DefaultBranch, doc.CreationTime,
|
||||
|
||||
29
api/internal/crawl/search_cmds/generator.md
Normal file
29
api/internal/crawl/search_cmds/generator.md
Normal file
@@ -0,0 +1,29 @@
|
||||
Find all the generator files whose `kinds` field includes `ChartRenderer`, and
|
||||
only output certain fields of each document:
|
||||
```
|
||||
curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d'
|
||||
{
|
||||
"size": 200,
|
||||
"_source": {
|
||||
"includes": ["kinds", "repositoryUrl", "defaultBranch", "filePath"]
|
||||
},
|
||||
"query": {
|
||||
"bool": {
|
||||
"filter": [
|
||||
{ "regexp": { "fileType": "generator" }}
|
||||
],
|
||||
"must_not": {
|
||||
"regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }
|
||||
},
|
||||
"must": {
|
||||
"match" : {
|
||||
"kinds" : {
|
||||
"query" : "ChartRenderer"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
'
|
||||
```
|
||||
29
api/internal/crawl/search_cmds/snapshot.md
Normal file
29
api/internal/crawl/search_cmds/snapshot.md
Normal file
@@ -0,0 +1,29 @@
|
||||
Retrieve information about all registered snapshot repositories:
|
||||
```
|
||||
curl -X GET "${ElasticSearchURL}:9200/_snapshot?pretty"
|
||||
```
|
||||
|
||||
Retrieve information about a given snapshot repository, `kustomize-backup`:
|
||||
```
|
||||
curl -X GET "${ElasticSearchURL}:9200/_snapshot/kustomize-backup?pretty"
|
||||
```
|
||||
|
||||
Verify a snapshot repository, `kustomize-backup`, manually:
|
||||
```
|
||||
curl -X POST "${ElasticSearchURL}:9200/_snapshot/kustomize-backup/_verify?pretty"
|
||||
```
|
||||
|
||||
List all the snapshots in a given snapshot repository:
|
||||
```
|
||||
curl -X GET "${ElasticSearchURL}:9200/_cat/snapshots/kustomize-backup?v&s=id&pretty"
|
||||
```
|
||||
|
||||
Retrieve a summary information about a given snapshot:
|
||||
```
|
||||
curl -X GET "${ElasticSearchURL}:9200/_snapshot/kustomize-backup/kustomize-snapshot?pretty"
|
||||
```
|
||||
|
||||
Retrieve a detailed information about a given snapshot:
|
||||
```
|
||||
curl -X GET "${ElasticSearchURL}:9200/_snapshot/kustomize-backup/kustomize-snapshot/_status?pretty"
|
||||
```
|
||||
29
api/internal/crawl/search_cmds/transformer.md
Normal file
29
api/internal/crawl/search_cmds/transformer.md
Normal file
@@ -0,0 +1,29 @@
|
||||
Find all the trasnformer files whose `kinds` field includes `HelmValues`, and
|
||||
only output certain fields of each document:
|
||||
```
|
||||
curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d'
|
||||
{
|
||||
"size": 200,
|
||||
"_source": {
|
||||
"includes": ["kinds", "repositoryUrl", "defaultBranch", "filePath"]
|
||||
},
|
||||
"query": {
|
||||
"bool": {
|
||||
"filter": [
|
||||
{ "regexp": { "fileType": "transformer" }}
|
||||
],
|
||||
"must_not": {
|
||||
"regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" }
|
||||
},
|
||||
"must": {
|
||||
"match" : {
|
||||
"kinds" : {
|
||||
"query" : "HelmValues"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
'
|
||||
```
|
||||
Reference in New Issue
Block a user