Merge pull request #2172 from haiyanmeng/stats

Several improvements on crawler
2026-06-29 09:40:49 +00:00 · 2020-02-03 13:11:24 -08:00
parent c626eae9bd 3ebeebabde
commit c683e6ae3c
14 changed files with 362 additions and 174 deletions
--- a/api/internal/crawl/cmd/crawler/crawler.go
+++ b/api/internal/crawl/cmd/crawler/crawler.go
@@ -167,7 +167,7 @@ func main() {
 		it := idx.IterateQuery(query, 10000, 60*time.Second)
 		for it.Next() {
 			for _, hit := range it.Value().Hits.Hits {
-				seedDocs = append(seedDocs, hit.Document.Copy())
+				seedDocs = append(seedDocs, hit.Document.Document.Copy())
 			}
 		}
 		if err := it.Err(); err != nil {
--- a/api/internal/crawl/cmd/kustomize_stats/main.go
+++ b/api/internal/crawl/cmd/kustomize_stats/main.go
@@ -2,16 +2,13 @@ package main

 import (
 	"context"
+	"crypto/sha256"
 	"flag"
 	"fmt"
 	"log"
-	"net/http"
-	"os"
 	"sort"
 	"time"

-	"sigs.k8s.io/kustomize/api/internal/crawl/crawler/github"
-
 	"sigs.k8s.io/kustomize/api/internal/crawl/doc"

 	"sigs.k8s.io/kustomize/api/internal/crawl/index"
@@ -34,9 +31,9 @@ func iterateArr(arr []string, countMap map[string]int) {

 }

-// SortMapKeyByValue takes a map as its input, sorts its keys according to their values
+// SortMapKeyByValueInt takes a map as its input, sorts its keys according to their values
 // in the map, and outputs the sorted keys as a slice.
-func SortMapKeyByValue(m map[string]int) []string {
+func SortMapKeyByValueInt(m map[string]int) []string {
 	keys := make([]string, 0, len(m))
 	for key := range m {
 		keys = append(keys, key)
@@ -46,56 +43,58 @@ func SortMapKeyByValue(m map[string]int) []string {
 	return keys
 }

-func GeneratorOrTransformerStats(ctx context.Context,
-	docs []*doc.Document, isGenerator bool, idx *index.KustomizeIndex) {
+// SortMapKeyByValueLen takes a map as its input, sorts its keys in descending order of the
+// length of their value slices in the map, and outputs the sorted keys as a slice.
+func SortMapKeyByValueLen(m map[string][]string) []string {
+	keys := make([]string, 0, len(m))
+	for key := range m {
+		keys = append(keys, key)
+	}
+	// sort keys according to their values in the map m
+	sort.Slice(keys, func(i, j int) bool { return len(m[keys[i]]) > len(m[keys[j]]) })
+	return keys
+}

-	fieldName := "generators"
-	if !isGenerator {
-		fieldName = "transformers"
+func GeneratorOrTransformerStats(docs []*doc.KustomizationDocument) {
+	n := len(docs)
+	if n == 0 {
+		return
 	}

-	// allReferredDocs includes all the documents referred in the field
-	allReferredDocs := doc.NewUniqueDocuments()
+	fileType := docs[0].FileType
+	fmt.Printf("There are totally %d %s files.\n", n, fileType)

-	// docUsingGeneratorCount counts the number of the kustomization files using generators or transformers
-	docCount := 0
+	GitRepositorySummary(docs, fileType)
+
+	// key of kindToUrls: a string in the KustomizationDocument.Kinds field
+	// value of kindToUrls: a slice of string urls defining a given kind.
+	kindToUrls := make(map[string][]string)

-	// collect all the documents referred in the field
 	for _, d := range docs {
-		kdoc := doc.KustomizationDocument{
-			Document: *d,
-		}
-		referredDocs, err := kdoc.GetResources(false, !isGenerator, isGenerator)
-		if err != nil {
-			log.Printf("failed to parse the %s field of the Document (%s): %v",
-				fieldName, d.Path(), err)
-		}
-		if len(referredDocs) > 0 {
-			docCount++
-			allReferredDocs.AddDocuments(referredDocs)
+		url := fmt.Sprintf("%s/blob/%s/%s", d.RepositoryURL, d.DefaultBranch, d.FilePath)
+		for _, kind := range d.Kinds {
+			if _, ok := kindToUrls[kind]; !ok {
+				kindToUrls[kind] = []string{url}
+			} else {
+				kindToUrls[kind] = append(kindToUrls[kind], url)
+			}
+		}
+	}
+	fmt.Printf("There are totally %d kinds of %s\n", len(kindToUrls), fileType)
+	sortedKeys := SortMapKeyByValueLen(kindToUrls)
+	for _, k := range sortedKeys {
+		sort.Strings(kindToUrls[k])
+		fmt.Printf("%s kind %s appears %d times\n", fileType, k, len(kindToUrls[k]))
+		for _, url := range kindToUrls[k] {
+			fmt.Printf("%s\n", url)
 		}
 	}
-
-	fileCount, dirCount, fileTypeDocs, dirTypeDocs := DocumentTypeSummary(ctx, allReferredDocs.Documents())
-
-	// check whether any of the files are not in the index
-	nonExistFileCount := ExistInIndex(idx, fileTypeDocs, fieldName + " file ")
-	// check whether any of the dirs are not in the index
-	nonExistDirCount := ExistInIndex(idx, dirTypeDocs, fieldName + " dir ")
-
-	GitRepositorySummary(fileTypeDocs, fieldName + " files")
-	GitRepositorySummary(dirTypeDocs, fieldName + " dirs")
-
-	fmt.Printf("%d kustomization files use %s: %d %s are files and %d %s are dirs.\n",
-		docCount, fieldName, fileCount, fieldName, dirCount, fieldName)
-	fmt.Printf("%d %s files do not exist in the index\n", nonExistFileCount, fieldName)
-	fmt.Printf("%d %s dirs do not exist in the index\n", nonExistDirCount, fieldName)
 }

 // GitRepositorySummary counts the distribution of docs:
 // 1) how many git repositories are these docs from?
 // 2) how many docs are from each git repository?
-func GitRepositorySummary(docs []*doc.Document, msgPrefix string) {
+func GitRepositorySummary(docs []*doc.KustomizationDocument, fileType string) {
 	m := make(map[string]int)
 	for _, d := range docs {
 		if _, ok := m[d.RepositoryURL]; ok {
@@ -104,65 +103,16 @@ func GitRepositorySummary(docs []*doc.Document, msgPrefix string) {
 			m[d.RepositoryURL] = 1
 		}
 	}
-	sortedKeys := SortMapKeyByValue(m)
+	sortedKeys := SortMapKeyByValueInt(m)
+	topN := 10
+	i := 0
 	for _, k := range sortedKeys {
-		fmt.Printf("%d %s are from %s\n", m[k], msgPrefix, k)
-	}
-}
-
-// ExistInIndex goes through each Document in docs, and check whether it is in the index or not.
-// It returns the number of documents which does not exist in the index.
-func ExistInIndex(idx *index.KustomizeIndex, docs []*doc.Document, msgPrefix string) int {
-	nonExistCount := 0
-	for _, d := range docs {
-		exists, err := idx.Exists(d.ID())
-		if err != nil {
-			log.Println(err)
-		}
-		if !exists {
-			log.Printf("%s (%s) does not exist in the index", msgPrefix, d.Path())
-			nonExistCount++
+		if i >= topN {
+			break
 		}
+		fmt.Printf("%d %s are from %s\n", m[k], fileType, k)
+		i++
 	}
-	return nonExistCount
-}
-
-// DocumentTypeSummary goes through each doc in docs, and determines whether it is a file or dir.
-func DocumentTypeSummary(ctx context.Context, docs []*doc.Document) (
-	fileCount, dirCount int, files, dirs []*doc.Document) {
-	githubToken := os.Getenv(githubAccessTokenVar)
-	if githubToken == "" {
-		log.Fatalf("Must set the variable '%s' to make github requests.\n",
-			githubAccessTokenVar)
-	}
-	ghCrawler := github.NewCrawler(githubToken, retryCount, &http.Client{}, github.QueryWith())
-
-	for _, d := range docs {
-		oldFilePath := d.FilePath
-		if err := ghCrawler.FetchDocument(ctx, d); err != nil {
-			log.Printf("FetchDocument failed on %s: %v", d.Path(), err)
-			continue
-		}
-
-		if d.FilePath == oldFilePath {
-			fileCount++
-			files = append(files, d)
-		} else {
-			dirCount++
-			dirs = append(dirs, d)
-		}
-	}
-	return fileCount, dirCount, files, dirs
-}
-
-// ExistInSlice checks where target exits in items.
-func ExistInSlice(items []string, target string) bool {
-	for _, item := range items {
-		if item == target {
-			return true
-		}
-	}
-	return false
 }

 func main() {
@@ -204,17 +154,26 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
 	// ids tracks the unique IDs of the documents in the index
 	ids := make(map[string]struct{})

-	// generatorDocs includes all the docs using generators
-	generatorDocs := make([]*doc.Document, 0)
+	// generatorFiles includes all the non-kustomization files whose FileType is generator
+	generatorFiles := make([]*doc.KustomizationDocument, 0)

-	// transformersDocs includes all the docs using transformers
-	transformersDocs := make([]*doc.Document, 0)
+	// transformersFiles includes all the non-kustomization files whose FileType is transformer
+	transformersFiles := make([]*doc.KustomizationDocument, 0)
+
+	checksums := make(map[string]int)

 	// get all the documents in the index
 	query := []byte(`{ "query":{ "match_all":{} } }`)
 	it := idx.IterateQuery(query, 10000, 60*time.Second)
 	for it.Next() {
 		for _, hit := range it.Value().Hits.Hits {
+			sum := fmt.Sprintf("%x", sha256.Sum256([]byte(hit.Document.DocumentData)))
+			if _, ok := checksums[sum]; ok {
+				checksums[sum]++
+			} else {
+				checksums[sum] = 1
+			}
+
 			// check whether there is any duplicate IDs in the index
 			if _, ok := ids[hit.ID]; !ok {
 				ids[hit.ID] = struct{}{}
@@ -229,11 +188,13 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
 			if doc.IsKustomizationFile(hit.Document.FilePath) {
 				kustomizationFilecount++
 				iterateArr(hit.Document.Identifiers, kustomizeIdentifiersMap)
-				if ExistInSlice(hit.Document.Identifiers, "generators") {
-					generatorDocs = append(generatorDocs, hit.Document.Copy())
-				}
-				if ExistInSlice(hit.Document.Identifiers, "transformers") {
-					transformersDocs = append(transformersDocs, hit.Document.Copy())
+
+			} else {
+				switch hit.Document.FileType {
+				case "generator":
+					generatorFiles = append(generatorFiles, hit.Document.Copy())
+				case "transformer":
+					transformersFiles = append(transformersFiles, hit.Document.Copy())
 				}
 			}
 		}
@@ -243,9 +204,9 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
 		log.Fatalf("Error iterating: %v\n", err)
 	}

-	sortedKindsMapKeys := SortMapKeyByValue(kindsMap)
-	sortedIdentifiersMapKeys := SortMapKeyByValue(identifiersMap)
-	sortedKustomizeIdentifiersMapKeys := SortMapKeyByValue(kustomizeIdentifiersMap)
+	sortedKindsMapKeys := SortMapKeyByValueInt(kindsMap)
+	sortedIdentifiersMapKeys := SortMapKeyByValueInt(identifiersMap)
+	sortedKustomizeIdentifiersMapKeys := SortMapKeyByValueInt(kustomizeIdentifiersMap)

 	fmt.Printf(`The count of unique document IDs in the kustomize index: %d
 There are %d documents in the kustomize index.
@@ -280,6 +241,14 @@ There are %d documents in the kustomize index.
 		}
 	}

-	GeneratorOrTransformerStats(ctx, generatorDocs, true, idx)
-	GeneratorOrTransformerStats(ctx, transformersDocs, false, idx)
+	GeneratorOrTransformerStats(generatorFiles)
+	GeneratorOrTransformerStats(transformersFiles)
+
+	fmt.Printf("There are total %d checksums of document contents\n", len(checksums))
+	sortedChecksums := SortMapKeyByValueInt(checksums)
+	sortedChecksums = sortedChecksums[:20]
+	fmt.Printf("The top 20 checksums are:\n")
+	for _, key := range sortedChecksums {
+		fmt.Printf("checksum %s apprears %d\n", key, checksums[key])
+	}
 }
--- a/api/internal/crawl/config/crawler/cronjob/cronjob.yaml
+++ b/api/internal/crawl/config/crawler/cronjob/cronjob.yaml
@@ -1,9 +1,10 @@
 apiVersion: batch/v1beta1
 kind: CronJob
 metadata:
-  name: crawler
+  name: crawler-cronjob
 spec:
-  schedule: "5 0 * * */1"
+  # run the cronjob at 00:00 on days 1, 8, 15, 22 and 29 of each month (roughly every 7 days)
+  schedule: "0 0 */7 * *"
  jobTemplate:
    spec:
      template:
@@ -11,7 +12,9 @@ spec:
          restartPolicy: OnFailure
          containers:
          - name: crawler
-            image: gcr.io/kustomize-search/crawler:latest
+            image: gcr.io/haiyanmeng-gke-dev/crawler:v1
+            command: ["/crawler"]
+            args: ["--mode=index+github", "--github-repo=kubernetes-sigs/kustomize", "--index=kustomize"]
            imagePullPolicy: Always
            env:
            - name: GITHUB_ACCESS_TOKEN
--- a/api/internal/crawl/config/elastic/esbackup.yaml
+++ b/api/internal/crawl/config/elastic/esbackup.yaml
@@ -0,0 +1,16 @@
+# Creating `esbackup/kustomize-backup` will create the `kustomize-backup` snapshot repository.
+# Deleting `esbackup/kustomize-backup` will delete the `kustomize-backup` snapshot repository and all the snapshots in the repository.
+# Deleting `esbackup/kustomize-backup` will NOT delete essnapshot and esrestore objects.
+apiVersion: elasticsearch.cloud.google.com/v1alpha1
+kind: ESBackup
+metadata:
+  name: kustomize-backup
+spec:
+  storage:
+    gcs:
+      bucket: kustomize-backup
+      path: kustomize
+      secret:
+        name: kustomizesa
+  escluster:
+    name: esbasic
--- a/api/internal/crawl/config/elastic/escluster.yaml
+++ b/api/internal/crawl/config/elastic/escluster.yaml
@@ -8,6 +8,13 @@ spec:
    - repository-gcs
    - ingest-user-agent
    - ingest-geoip
+  # To set `gcpserviceaccount`,
+  # First, create a GCP service account and download its key into a JSON file named `sakey.json`, following the instructions at:
+  #     https://www.elastic.co/guide/en/elasticsearch/plugins/6.5/repository-gcs-usage.html#repository-gcs-using-service-account
+  # Second, create a secret for the service account using the following command:
+  #     $ kubectl create secret generic kustomizesa --from-file=./sakey.json
+  gcpserviceaccount:
+    name: kustomizesa
  config:
    env:
      example: test
--- a/api/internal/crawl/config/elastic/esrestore.yaml
+++ b/api/internal/crawl/config/elastic/esrestore.yaml
@@ -0,0 +1,16 @@
+# Creating `esrestore/kustomize-restore` will restore the `kustomize` index in the `kustomize-snapshot` snapshot to a new index named `kustomize-restore`.
+# Deleting `esrestore/kustomize-restore` will not delete the restored index.
+# Deleting `esrestore/kustomize-restore` should happen before deleting `essnapshot/kustomize-snapshot`.
+apiVersion: elasticsearch.cloud.google.com/v1alpha1
+kind: ESRestore
+metadata:
+  name: kustomize-restore
+spec:
+  include_global_state: true
+  ignore_unavailable: true
+  rename_pattern: kustomize
+  rename_replacement: kustomize-restore
+  essnapshot:
+    name: kustomize-snapshot
+  escluster:
+    name: esbasic
--- a/api/internal/crawl/config/elastic/essnaptshot.yaml
+++ b/api/internal/crawl/config/elastic/essnaptshot.yaml
@@ -0,0 +1,15 @@
+# Creating `essnapshot/kustomize-snapshot` will create a snapshot named `kustomize-snapshot` in the `kustomize-backup` snapshot repository.
+# Deleting `essnapshot/kustomize-snapshot` will delete the snapshot.
+# Deleting `essnapshot/kustomize-snapshot` should happen before deleting `esbackup/kustomize-backup`.
+apiVersion: elasticsearch.cloud.google.com/v1alpha1
+kind: ESSnapshot
+metadata:
+  name: kustomize-snapshot
+spec:
+  # indices are optional. If not specified all indices are selected.
+  indices:
+  - kustomize
+  include_global_state: true
+  ignore_unavailable: true
+  esbackup:
+    name: kustomize-backup
--- a/api/internal/crawl/crawler/github/crawler.go
+++ b/api/internal/crawl/crawler/github/crawler.go
@@ -80,6 +80,36 @@ func (gc githubCrawler) DefaultBranch(repo string) string {
 func (gc githubCrawler) Crawl(ctx context.Context,
 	output chan<- crawler.CrawledDocument, seen utils.SeenMap) error {

+	ranges := []RangeWithin{
+		RangeWithin{
+		start: uint64(0),
+		end:   githubMaxFileSize,
+	},
+	}
+
+	errs := make(multiError, 0)
+	for len(ranges) > 0 {
+		tailRange := ranges[len(ranges) - 1]
+		ranges = ranges[:(len(ranges) - 1)]
+		reProcessQueryRanges, err := gc.CrawlSingleRange(ctx, output, seen, tailRange.start, tailRange.end)
+		if err != nil {
+			errs = append(errs, err)
+		}
+		ranges = append(ranges, reProcessQueryRanges...)
+	}
+
+	if len(errs) > 0 {
+		return errs
+	}
+	return nil
+}
+
+func (gc githubCrawler) CrawlSingleRange(ctx context.Context,
+	output chan<- crawler.CrawledDocument, seen utils.SeenMap,
+	lowerBound, upperBound uint64) ([]RangeWithin, error) {
+
+	log.Printf("CrawlSingleRange [%d, %d]", lowerBound, upperBound)
+
 	noETagClient := GhClient{
 		RequestConfig: gc.client.RequestConfig,
 		client:        &http.Client{Timeout: gc.client.client.Timeout},
@@ -87,13 +117,16 @@ func (gc githubCrawler) Crawl(ctx context.Context,
 		accessToken:   gc.client.accessToken,
 	}

+	var reProcessQueryRanges []RangeWithin
+
 	var ranges []string
 	var err error
 	// Since Github returns a max of 1000 results per query, we can use
 	// multiple queries that split the search space into chunks of at most
 	// 1000 files to get all of the data.
 	for i := 0; i < 5; i++ {
-		ranges, err = FindRangesForRepoSearch(newCache(noETagClient, gc.query))
+		ranges, err = FindRangesForRepoSearch(newCache(noETagClient, gc.query),
+			lowerBound, upperBound)
 		if err == nil {
 			logger.Printf("FindRangesForRepoSearch succeeded after %d retries", i)
 			break
@@ -102,7 +135,7 @@ func (gc githubCrawler) Crawl(ctx context.Context,
 		}
 	}
 	if err != nil {
-		return fmt.Errorf("could not split %v into ranges, %v\n",
+		return reProcessQueryRanges, fmt.Errorf("could not split %v into ranges, %v\n",
 			gc.query, err)
 	}

@@ -112,20 +145,23 @@ func (gc githubCrawler) Crawl(ctx context.Context,
 	errs := make(multiError, 0)
 	queryResult := RangeQueryResult{}
 	for _, query := range ranges {
-		rangeResult, err := processQuery(ctx, gc.client, query, output, seen, gc.branchMap)
+		reProcessQuery, rangeResult, err := processQuery(ctx, gc.client, query, output, seen, gc.branchMap)
 		if err != nil {
 			errs = append(errs, err)
 		}
 		queryResult.Add(rangeResult)
+		if reProcessQuery {
+			reProcessQueryRanges = append(reProcessQueryRanges, RangeSizes(query))
+		}
 	}

 	logger.Printf("Summary of Crawl: %s", queryResult.String())

 	if len(errs) > 0 {
-		return errs
+		return reProcessQueryRanges, errs
 	}

-	return nil
+	return reProcessQueryRanges, nil
 }

 // FetchDocument first tries to fetch the document with d.FilePath. If it fails,
@@ -225,7 +261,7 @@ func (r *RangeQueryResult) String() string {
 // documents from the crawl to the datastore/index.
 func processQuery(ctx context.Context, gcl GhClient, query string,
 	output chan<- crawler.CrawledDocument, seen utils.SeenMap,
-	branchMap map[string]string) (RangeQueryResult, error) {
+	branchMap map[string]string) (bool, RangeQueryResult, error) {

 	queryPages := make(chan GhResponseInfo)

@@ -241,6 +277,8 @@ func processQuery(ctx context.Context, gcl GhClient, query string,
 		close(queryPages)
 	}()

+	reProcessQuery := false
+
 	errs := make(multiError, 0)
 	result := RangeQueryResult{}
 	pageID := 1
@@ -271,11 +309,15 @@ func processQuery(ctx context.Context, gcl GhClient, query string,
 		result.Add(pageResult)

 		pageID++
+
+		if page.Parsed.TotalCount > githubMaxResultsPerQuery {
+			reProcessQuery = true
+		}
 	}

 	logger.Printf("Summary of processQuery: %s", result.String())

-	return result, errs
+	return reProcessQuery, result, errs
 }

 func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap,
@@ -337,7 +379,7 @@ func (gcl GhClient) ForwardPaginatedQuery(ctx context.Context, query string,
 	output chan<- GhResponseInfo) error {

 	logger.Println("querying: ", query)
-	response := gcl.parseGithubResponse(query)
+	response := gcl.parseGithubResponseWithRetry(query)

 	if response.Error != nil {
 		return response.Error
@@ -350,7 +392,7 @@ func (gcl GhClient) ForwardPaginatedQuery(ctx context.Context, query string,
 		case <-ctx.Done():
 			return nil
 		default:
-			response = gcl.parseGithubResponse(response.NextURL)
+			response = gcl.parseGithubResponseWithRetry(response.NextURL)
 			if response.Error != nil {
 				return response.Error
 			}
@@ -545,6 +587,8 @@ type githubResponse struct {
 	// This is the number of files that match the query.
 	TotalCount uint64 `json:"total_count,omitempty"`

+	IncompleteResults bool `json:"incomplete_results,omitempty"`
+
 	// Github representation of a file.
 	Items []GhFileSpec `json:"items,omitempty"`
 }
@@ -587,6 +631,17 @@ func parseGithubLinkFormat(links string) (string, string) {
 	return next, last
 }

+func (gcl GhClient) parseGithubResponseWithRetry(getRequest string) GhResponseInfo {
+	resp := gcl.parseGithubResponse(getRequest)
+	retries := 0
+	for resp.Parsed.IncompleteResults {
+		resp = gcl.parseGithubResponse(getRequest)
+		retries++
+	}
+	log.Printf("The result of query(%s) is complete after %d retries", getRequest, retries)
+	return resp
+}
+
 func (gcl GhClient) parseGithubResponse(getRequest string) GhResponseInfo {
 	resp, err := gcl.SearchGithubAPI(getRequest)
 	requestInfo := GhResponseInfo{
--- a/api/internal/crawl/crawler/github/split_search_ranges.go
+++ b/api/internal/crawl/crawler/github/split_search_ranges.go
@@ -100,6 +100,8 @@ package github
 import (
 	"fmt"
 	"math/bits"
+	"strconv"
+	"strings"
 )

 // Files cannot be more than 2^19 bytes, according to
@@ -112,7 +114,7 @@ const (
 // Interface instead of struct for testing purposes.
 // Not expecting to have multiple implementations.
 type cachedSearch interface {
-	CountResults(uint64) (uint64, error)
+	CountResults(uint64, uint64) (uint64, error)
 	RequestString(filesize rangeFormatter) string
 }

@@ -161,16 +163,16 @@ func newCache(client GhClient, query Query) githubCachedSearch {
 	}
 }

-func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) {
+func (c githubCachedSearch) CountResults(lowerBound, upperBound uint64) (uint64, error) {
 	count, cached := c.cache[upperBound]
 	if cached {
 		return count, nil
 	}

-	sizeRange := RangeWithin{0, upperBound}
+	sizeRange := RangeWithin{lowerBound, upperBound}
 	rangeRequest := c.RequestString(sizeRange)

-	result := c.gcl.parseGithubResponse(rangeRequest)
+	result := c.gcl.parseGithubResponseWithRetry(rangeRequest)
 	if result.Error != nil {
 		return count, result.Error
 	}
@@ -204,7 +206,7 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) {
 			"Retrying query... current lower bound: %d, got: %d\n",
 			c.cache[prev], result.Parsed.TotalCount)

-		result = c.gcl.parseGithubResponse(rangeRequest)
+		result = c.gcl.parseGithubResponseWithRetry(rangeRequest)
 		if result.Error != nil {
 			return count, result.Error
 		}
@@ -219,8 +221,8 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) {
 	}

 	count = result.Parsed.TotalCount
-	logger.Printf("Caching new query %s, with count %d\n",
-		sizeRange.RangeString(), count)
+	logger.Printf("Caching new query %s, with count %d (incomplete_results: %v)\n",
+		sizeRange.RangeString(), count, result.Parsed.IncompleteResults)
 	c.cache[upperBound] = count
 	return count, nil
 }
@@ -238,8 +240,8 @@ func (c githubCachedSearch) RequestString(filesize rangeFormatter) string {
 // This would mean that the search as it is could not find all files. If queries
 // are sorted by last indexed, and retrieved on regular intervals, it should be
 // sufficient to get most if not all documents.
-func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
-	totalFiles, err := cache.CountResults(githubMaxFileSize)
+func FindRangesForRepoSearch(cache cachedSearch, lowerBound, upperBound uint64) ([]string, error) {
+	totalFiles, err := cache.CountResults(lowerBound, upperBound)
 	if err != nil {
 		return nil, err
 	}
@@ -247,7 +249,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {

 	if githubMaxResultsPerQuery >= totalFiles {
 		return []string{
-			cache.RequestString(RangeWithin{0, githubMaxFileSize}),
+			cache.RequestString(RangeWithin{lowerBound, upperBound}),
 		}, nil
 	}

@@ -275,6 +277,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
 	// range.
 	filesAccessible := uint64(0)
 	sizes := make([]uint64, 0)
+	sizes = append(sizes, lowerBound)
 	for filesAccessible < totalFiles {
 		target := filesAccessible + githubMaxResultsPerQuery
 		if target >= totalFiles {
@@ -284,22 +287,22 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
 		logger.Printf("%d accessible files, next target = %d\n",
 			filesAccessible, target)

-		cur, err := lowerBoundFileCount(cache, target)
+		size, err := FindFileSize(cache, target, lowerBound, upperBound)
 		if err != nil {
 			return nil, err
 		}

 		// If there are more than 1000 files in the next bucket, we must
 		// advance anyway and lose out on some files :(.
-		if l := len(sizes); l > 0 && sizes[l-1] == cur {
-			cur++
+		if l := len(sizes); l > 0 && sizes[l-1] == size {
+			size++
 		}

-		nextAccessible, err := cache.CountResults(cur)
+		nextAccessible, err := cache.CountResults(lowerBound, size)
 		if err != nil {
 			return nil, fmt.Errorf(
 				"cache should be populated at %d already, got %v",
-				cur, err)
+				size, err)
 		}
 		if nextAccessible < filesAccessible {
 			return nil, fmt.Errorf(
@@ -309,31 +312,31 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {

 		filesAccessible = nextAccessible
 		if nextAccessible < totalFiles {
-			sizes = append(sizes, cur)
+			sizes = append(sizes, size)
 		}
 	}
-
+	sizes = append(sizes, upperBound)
 	return formatFilesizeRanges(cache, sizes), nil
 }

-// lowerBoundFileCount finds the filesize range from [0, return value] that has
+// FindFileSize finds the filesize range from [lowerBound, return value] that has
 // the largest file count that is smaller than or equal to
 // githubMaxResultsPerQuery. It is important to note that this returned value
 // could already be in a previous range if the next file size has more than 1000
 // results. It is left to the caller to handle this bit of logic and guarantee
 // forward progession in this case.
-func lowerBoundFileCount(
-	cache cachedSearch, targetFileCount uint64) (uint64, error) {
+func FindFileSize(
+	cache cachedSearch, targetFileCount, lowerBound, upperBound uint64) (uint64, error) {

 	// Binary search for file sizes that make up the next <=1000 element
 	// chunk.
-	cur := uint64(0)
-	increase := githubMaxFileSize / 2
+	cur := lowerBound
+	increase := (upperBound - lowerBound) / 2

 	for increase > 0 {
 		mid := cur + increase

-		count, err := cache.CountResults(mid)
+		count, err := cache.CountResults(lowerBound, mid)
 		if err != nil {
 			return count, err
 		}
@@ -353,26 +356,24 @@ func lowerBoundFileCount(
 }

 func formatFilesizeRanges(cache cachedSearch, sizes []uint64) []string {
-	ranges := make([]string, 0, len(sizes)+1)
-
-	if len(sizes) > 0 {
-		ranges = append(ranges, cache.RequestString(
-			RangeLessThan{sizes[0] + 1},
-		))
+	n := len(sizes)
+	if n < 2 {
+		return []string{}
 	}

-	for i := 0; i < len(sizes)-1; i += 1 {
-		ranges = append(ranges, cache.RequestString(
-			RangeWithin{sizes[i] + 1, sizes[i+1]},
-		))
-
-		if i != len(sizes)-2 {
-			continue
-		}
-		ranges = append(ranges, cache.RequestString(
-			RangeGreaterThan{sizes[i+1]},
-		))
+	ranges := make([]string, 0, n-1)
+	ranges = append(ranges, cache.RequestString(RangeWithin{sizes[0], sizes[1]}))
+	for i := 1; i < n-1; i++ {
+		ranges = append(ranges, cache.RequestString(RangeWithin{sizes[i] + 1, sizes[i+1]}))
 	}
-
 	return ranges
 }
+
+func RangeSizes(s string) RangeWithin {
+	start := strings.Index(s, "+size:") + len("+size:")
+	end := strings.Index(s, "&")
+	ranges := strings.Split(s[start:end], "..")
+	lowerBound, _ := strconv.ParseUint(ranges[0], 10, 64)
+	upperBound, _ := strconv.ParseUint(ranges[1], 10, 64)
+	return RangeWithin{lowerBound, upperBound}
+}
--- a/api/internal/crawl/crawler/github/split_search_ranges_test.go
+++ b/api/internal/crawl/crawler/github/split_search_ranges_test.go
@@ -11,7 +11,7 @@ type testCachedSearch struct {
 	cache map[uint64]uint64
 }

-func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) {
+func (c testCachedSearch) CountResults(lowerBound, upperBound uint64) (uint64, error) {
 	log.Printf("CountResults(%05x)\n", upperBound)
 	count, ok := c.cache[upperBound]
 	if !ok {
@@ -73,19 +73,29 @@ func TestRangeSplitting(t *testing.T) {
 		},
 	}

-	requests, err := FindRangesForRepoSearch(cache)
+	requests, err := FindRangesForRepoSearch(cache, 0, 524288)
 	if err != nil {
 		t.Errorf("Error while finding ranges: %v", err)
 	}
 	expected := []string{
-		"<107",      // cache.RequestString(RangeLessThan{0x6b}),
-		"107..128",  // cache.RequestString(RangeWithin{0x6b, 0x80}),
-		"129..256",  // cache.RequestString(RangeWithin{0x81, 0x100}),
-		"257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}),
-		">4095",     // cache.RequestString(RangeGreaterThan{0xfff}),
+		"0..106",       // cache.RequestString(RangeWithin{0x00, 0x6a}),
+		"107..128",     // cache.RequestString(RangeWithin{0x6b, 0x80}),
+		"129..256",     // cache.RequestString(RangeWithin{0x81, 0x100}),
+		"257..4095",    // cache.RequestString(RangeWithin{0x101, 0xfff}),
+		"4096..524288", // cache.RequestString(RangeWithin{0x1000, 0x80000}),
 	}

 	if !reflect.DeepEqual(requests, expected) {
 		t.Errorf("Expected requests (%v) to equal (%v)", requests, expected)
 	}
 }
+
+func TestRangeSizes(t *testing.T) {
+	s := "https://api.github.com/search/code?q=filename:kustomization.yaml+filename:kustomization.yml" +
+		"+filename:kustomization+size:2365..10000&order=desc&per_page=100&sort=indexed"
+	returnedResult := RangeSizes(s)
+	expectedResult := RangeWithin{uint64(2365), uint64(10000)}
+	if !reflect.DeepEqual(returnedResult, expectedResult) {
+		t.Errorf("RangeSizes expected (%v), got (%v)",expectedResult, returnedResult)
+	}
+}
--- a/api/internal/crawl/doc/doc.go
+++ b/api/internal/crawl/doc/doc.go
@@ -46,6 +46,15 @@ type KustomizationDocument struct {

 type set map[string]struct{}

+func (doc *KustomizationDocument) Copy() *KustomizationDocument {
+	return &KustomizationDocument{
+		Document:    *(doc.Document.Copy()),
+		Kinds:       doc.Kinds,
+		Identifiers: doc.Identifiers,
+		Values:      doc.Values,
+	}
+}
+
 func (doc *KustomizationDocument) String() string {
 	return fmt.Sprintf("%s %s %s %v %v %v len(identifiers):%v  len(values):%v",
 		doc.RepositoryURL, doc.FilePath, doc.DefaultBranch, doc.CreationTime,
--- a/api/internal/crawl/search_cmds/generator.md
+++ b/api/internal/crawl/search_cmds/generator.md
@@ -0,0 +1,29 @@
+Find all the generator files whose `kinds` field includes `ChartRenderer`, and
+only output certain fields of each document:
+```
+curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d'
+{
+  "size": 200,
+  "_source": {
+    "includes": ["kinds", "repositoryUrl", "defaultBranch", "filePath"]
+  },
+  "query": {
+    "bool": {
+      "filter": [
+       { "regexp": { "fileType": "generator" }}
+      ],
+      "must_not": {
+        "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*"  }
+      },
+      "must": {
+        "match" : {
+          "kinds" : {
+            "query" : "ChartRenderer"
+          }
+        }
+      }
+    }
+  }
+}
+'
+```
--- a/api/internal/crawl/search_cmds/snapshot.md
+++ b/api/internal/crawl/search_cmds/snapshot.md
@@ -0,0 +1,29 @@
+Retrieve information about all registered snapshot repositories:
+```
+curl -X GET "${ElasticSearchURL}:9200/_snapshot?pretty"
+```
+
+Retrieve information about a given snapshot repository, `kustomize-backup`:
+```
+curl -X GET "${ElasticSearchURL}:9200/_snapshot/kustomize-backup?pretty"
+```
+
+Verify a snapshot repository, `kustomize-backup`, manually:
+```
+curl -X POST "${ElasticSearchURL}:9200/_snapshot/kustomize-backup/_verify?pretty"
+```
+
+List all the snapshots in a given snapshot repository:
+```
+curl -X GET "${ElasticSearchURL}:9200/_cat/snapshots/kustomize-backup?v&s=id&pretty"
+```
+
+Retrieve summary information about a given snapshot:
+```
+curl -X GET "${ElasticSearchURL}:9200/_snapshot/kustomize-backup/kustomize-snapshot?pretty"
+```
+
+Retrieve detailed information about a given snapshot:
+```
+curl -X GET "${ElasticSearchURL}:9200/_snapshot/kustomize-backup/kustomize-snapshot/_status?pretty"
+```
--- a/api/internal/crawl/search_cmds/transformer.md
+++ b/api/internal/crawl/search_cmds/transformer.md
@@ -0,0 +1,29 @@
+Find all the transformer files whose `kinds` field includes `HelmValues`, and
+only output certain fields of each document:
+```
+curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d'
+{
+  "size": 200,
+  "_source": {
+    "includes": ["kinds", "repositoryUrl", "defaultBranch", "filePath"]
+  },
+  "query": {
+    "bool": {
+      "filter": [
+       { "regexp": { "fileType": "transformer" }}
+      ],
+      "must_not": {
+        "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*"  }
+      },
+      "must": {
+        "match" : {
+          "kinds" : {
+            "query" : "HelmValues"
+          }
+        }
+      }
+    }
+  }
+}
+'
+```