Introduce dummy program to help with API releases.

2026-06-11 09:02:53 +00:00 · 2019-10-07 21:21:12 -07:00
parent c1d20546ec
commit 78d14d0d75
110 changed files with 135 additions and 101 deletions
--- a/internal/crawl/crawler/crawler.go
+++ b/internal/crawl/crawler/crawler.go
@@ -0,0 +1,236 @@
+// Package crawler provides helper methods and defines an interface for launching
+// source repository crawlers that retrieve files from a source and forward them
+// to a channel for indexing and retrieval.
+package crawler
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"os"
+	"sync"
+
+	_ "github.com/gomodule/redigo/redis"
+
+	"sigs.k8s.io/kustomize/internal/tools/doc"
+)
+
+var (
+	// logger writes to stdout with the "Crawler: " prefix. The flags select
+	// the standard date/time stamp in UTC plus the full file path and line
+	// number of the call site.
+	logger = log.New(os.Stdout, "Crawler: ", log.LstdFlags|log.LUTC|log.Llongfile)
+)
+
+// Crawler forwards documents from source repositories to index and store them
+// for searching. Each crawler is responsible for querying its source of
+// information, and forwarding files that have not been seen before or that need
+// updating.
+type Crawler interface {
+	// Crawl returns when it is done processing. This method does not take
+	// ownership of the channel. The channel is write only, and it
+	// designates where the crawler should forward the documents.
+	Crawl(ctx context.Context, output chan<- CrawlerDocument) error
+
+	// FetchDocument gets the document data given the FilePath, Repo, and
+	// Ref/Tag/Branch.
+	FetchDocument(context.Context, *doc.Document) error
+	// SetCreated writes to the document what the created time is.
+	SetCreated(context.Context, *doc.Document) error
+
+	// Match reports whether this crawler can handle the given document.
+	// CrawlFromSeed hands a document to the first crawler in its list whose
+	// Match returns true.
+	Match(*doc.Document) bool
+}
+
+// CrawlerDocument is the unit that crawlers emit and that CrawlFromSeed indexes.
+type CrawlerDocument interface {
+	// ID identifies the document; CrawlFromSeed uses it as the key of its
+	// set of already indexed documents.
+	ID() string
+	// GetDocument returns the underlying document, which is what the
+	// crawlers' Match methods are given.
+	GetDocument() *doc.Document
+	// GetResources returns the documents this one depends on, so that they
+	// can be queued for crawling in turn.
+	GetResources() ([]*doc.Document, error)
+	// WasCached presumably reports whether the content was unchanged since
+	// the last fetch — TODO confirm against the implementations.
+	WasCached() bool
+}
+
+// CrawlerSeed is a list of documents to (re)visit. CrawlFromSeed also uses it
+// as a stack, popping documents from the end.
+type CrawlerSeed []*doc.Document
+
+// IndexFunc inserts a document into the index. It also receives the crawler
+// that matched the document.
+type IndexFunc func(CrawlerDocument, Crawler) error
+
+// Converter wraps a fetched document into a CrawlerDocument.
+type Converter func(*doc.Document) (CrawlerDocument, error)
+
+// CrawlFromSeed updates the documents listed in the seed, follows the resources
+// they reference, then runs every crawler and follows the resources of the
+// documents the crawlers discover.
+//
+// Cleaner, more efficient, and more extensible crawler implementation.
+// The seed must include the ids of each document in the index.
+//
+// conv wraps a fetched document into a CrawlerDocument and indx inserts it into
+// the index. Errors are only logged: the offending document is skipped and
+// nothing is reported back to the caller.
+func CrawlFromSeed(ctx context.Context, seed CrawlerSeed,
+	crawlers []Crawler, conv Converter, indx IndexFunc) {
+
+	// IDs of the documents that were already handed to indx.
+	seen := make(map[string]struct{})
+
+	logIfErr := func(err error) {
+		if err == nil {
+			return
+		}
+		logger.Println("error: ", err)
+	}
+
+	// Documents discovered through GetResources that still have to be
+	// fetched.
+	stack := make(CrawlerSeed, 0)
+
+	// findMatch returns the first crawler that accepts d, or nil.
+	findMatch := func(d *doc.Document) Crawler {
+		for _, crawl := range crawlers {
+			if crawl.Match(d) {
+				return crawl
+			}
+		}
+
+		return nil
+	}
+
+	// addBranches indexes cdoc once and pushes its not yet seen resources
+	// onto the stack.
+	addBranches := func(cdoc CrawlerDocument, match Crawler) {
+		if _, ok := seen[cdoc.ID()]; ok {
+			return
+		}
+
+		// The document is marked as seen before indexing, so it is not
+		// retried during this run even if indexing fails.
+		seen[cdoc.ID()] = struct{}{}
+		// Insert into index
+		err := indx(cdoc, match)
+		logIfErr(err)
+		if err != nil {
+			return
+		}
+
+		deps, err := cdoc.GetResources()
+		logIfErr(err)
+		if err != nil {
+			return
+		}
+		for _, dep := range deps {
+			if _, ok := seen[dep.ID()]; ok {
+				continue
+			}
+			stack = append(stack, dep)
+		}
+	}
+
+	// doCrawl pops documents from the end of *docsPtr until it is empty.
+	// When docsPtr points at stack, addBranches may push more work onto it
+	// while the loop is running.
+	doCrawl := func(docsPtr *CrawlerSeed) {
+		for len(*docsPtr) > 0 {
+			back := len(*docsPtr) - 1
+			next := (*docsPtr)[back]
+			*docsPtr = (*docsPtr)[:back]
+
+			match := findMatch(next)
+			if match == nil {
+				logIfErr(fmt.Errorf(
+					"%v could not match any crawler", next))
+				continue
+			}
+
+			err := match.FetchDocument(ctx, next)
+			logIfErr(err)
+			// If there was no change or there is an error, we don't have
+			// to branch out, since the dependencies are already in the
+			// index, or we cannot find the document.
+			if err != nil || next.WasCached() {
+				continue
+			}
+
+			cdoc, err := conv(next)
+			logIfErr(err)
+			if err != nil {
+				continue
+			}
+
+			addBranches(cdoc, match)
+		}
+	}
+	// Exploit seed to update bulk of corpus.
+	logger.Printf("updating %d documents from seed\n", len(seed))
+	doCrawl(&seed)
+	// Traverse any new links added while updating corpus.
+	logger.Printf("crawling %d new documents found in the seed\n", len(stack))
+	doCrawl(&stack)
+
+	ch := make(chan CrawlerDocument, 1<<10)
+	wg := sync.WaitGroup{}
+
+	// This goroutine is the only code touching seen and stack while
+	// CrawlerRunner blocks below; the function resumes using them only
+	// after wg.Wait().
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		for cdoc := range ch {
+			if _, ok := seen[cdoc.ID()]; ok {
+				continue
+			}
+			match := findMatch(cdoc.GetDocument())
+			if match == nil {
+				logIfErr(fmt.Errorf(
+					"%v could not match any crawler", cdoc))
+				continue
+			}
+			addBranches(cdoc, match)
+		}
+	}()
+
+	// Exploration through APIs.
+	errs := CrawlerRunner(ctx, ch, crawlers)
+	if errs != nil {
+		for _, err := range errs {
+			logIfErr(err)
+		}
+	}
+	// All crawlers have returned, so nothing writes to ch anymore.
+	close(ch)
+	logger.Println("Processing the new documents from the crawlers' exploration.")
+	wg.Wait()
+	// Handle deps of newly discovered documents.
+	logger.Printf("crawling the %d new documents from the crawlers' exploration.",
+		len(stack))
+	doCrawl(&stack)
+}
+
+// CrawlerRunner is a blocking function and only returns once all of the
+// crawlers are finished with execution.
+//
+// This function uses the output channel to forward kustomization documents
+// from a list of crawlers. The output is to be consumed by a database/search
+// indexer for later retrieval. The output channel is never closed here; that
+// is left to the caller.
+//
+// The return value is an array of errors in which each index represents the
+// index of the crawler that emitted the error. Although the errors themselves
+// can be nil, the array will always be exactly the size of the crawlers array.
+// A crawler that panics is recovered and reported through its error slot.
+//
+// CrawlerRunner does not take a seed itself; documents that are already in the
+// index are processed by CrawlFromSeed before it calls this function.
+func CrawlerRunner(ctx context.Context,
+	output chan<- CrawlerDocument, crawlers []Crawler) []error {
+
+	errs := make([]error, len(crawlers))
+	wg := sync.WaitGroup{}
+
+	for i, crawler := range crawlers {
+		// Crawler implementations get their own channels to prevent a
+		// crawler from closing the main output channel.
+		docs := make(chan CrawlerDocument)
+		wg.Add(2)
+
+		// Forward all of the documents from this crawler's channel to
+		// the main output channel.
+		go func(docs <-chan CrawlerDocument) {
+			defer wg.Done()
+			for doc := range docs {
+				output <- doc
+			}
+		}(docs)
+
+		// Run this crawler and capture its returned error.
+		go func(idx int, crawler Crawler,
+			docs chan<- CrawlerDocument) {
+
+			// Deferred calls run in reverse order: close(docs) first,
+			// then the recover handler, and wg.Done() last. Signalling
+			// completion only after errs[idx] has been written keeps
+			// wg.Wait() from returning while the error is still being
+			// recorded.
+			defer wg.Done()
+			defer func() {
+				if r := recover(); r != nil {
+					errs[idx] = fmt.Errorf(
+						"%+v panicked: %v, additional error %v",
+						crawler, r, errs[idx],
+					)
+				}
+			}()
+			defer close(docs)
+			errs[idx] = crawler.Crawl(ctx, docs)
+		}(i, crawler, docs) // Copies the index and the crawler
+	}
+
+	wg.Wait()
+	return errs
+}
--- a/internal/crawl/crawler/crawler_test.go
+++ b/internal/crawl/crawler/crawler_test.go
@@ -0,0 +1,356 @@
+package crawler
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"reflect"
+	"sort"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"sigs.k8s.io/kustomize/internal/tools/doc"
+	"sigs.k8s.io/kustomize/v3/pkg/pgmconfig"
+)
+
+const (
+	// kustomizeRepo is the repository URL shared by the seed, the matcher
+	// prefix, and the corpus documents in TestCrawlFromSeed.
+	kustomizeRepo = "https://github.com/kubernetes-sigs/kustomize"
+)
+
+// Simple crawler that forwards its list of documents to a provided channel and
+// returns its error to the caller.
+type testCrawler struct {
+	// matchPrefix is the document ID prefix accepted by Match.
+	matchPrefix string
+	// err is returned by Crawl once all documents were forwarded.
+	err error
+	// docs is the corpus served by Crawl and FetchDocument.
+	docs []doc.KustomizationDocument
+	// lukp maps a document ID to its index in docs (filled by newCrawler).
+	lukp map[string]int
+}
+
+// Match reports whether d is non-nil and its ID starts with the crawler's
+// matchPrefix.
+func (c testCrawler) Match(d *doc.Document) bool {
+	if d == nil {
+		return false
+	}
+	return strings.HasPrefix(d.ID(), c.matchPrefix)
+}
+
+// FetchDocument copies the corpus data of the document with d's ID into d.
+// When the ID itself is unknown, every kustomization file name is tried as a
+// path suffix; on a hit the suffix is also appended to d.FilePath. An error is
+// returned when no candidate is part of the corpus.
+func (c testCrawler) FetchDocument(ctx context.Context, d *doc.Document) error {
+	if idx, found := c.lukp[d.ID()]; found {
+		d.DocumentData = c.docs[idx].DocumentData
+		return nil
+	}
+
+	for _, name := range pgmconfig.KustomizationFileNames {
+		fmt.Println(d.ID(), "/", name)
+		if idx, found := c.lukp[d.ID()+"/"+name]; found {
+			d.FilePath += "/" + name
+			d.DocumentData = c.docs[idx].DocumentData
+			return nil
+		}
+	}
+
+	return fmt.Errorf("Document %v does not exist for matcher: %s",
+		d, c.matchPrefix)
+}
+
+// SetCreated stamps d with the zero time value; it never fails.
+func (c testCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
+	created := time.Time{}
+	d.CreationTime = &created
+	return nil
+}
+
+// newCrawler builds a testCrawler over docs and indexes every document by its
+// ID so that FetchDocument can look it up.
+func newCrawler(matchPrefix string, err error,
+	docs []doc.KustomizationDocument) testCrawler {
+	lookup := make(map[string]int)
+	for idx := range docs {
+		lookup[docs[idx].ID()] = idx
+	}
+
+	return testCrawler{
+		matchPrefix: matchPrefix,
+		err:         err,
+		docs:        docs,
+		lukp:        lookup,
+	}
+}
+
+// Crawl implements the Crawler interface for testing. It forwards every
+// document whose path ends in a kustomization file name, skips all the others
+// (plain resources), and finally returns the configured error.
+func (c testCrawler) Crawl(ctx context.Context,
+	output chan<- CrawlerDocument) error {
+
+	for idx := range c.docs {
+		path := c.docs[idx].FilePath
+
+		isKustomization := false
+		for _, name := range pgmconfig.KustomizationFileNames {
+			if strings.HasSuffix(path, name) {
+				isKustomization = true
+				break
+			}
+		}
+
+		if isKustomization {
+			output <- &c.docs[idx]
+		}
+	}
+	return c.err
+}
+
+// sortableDocs provides the Len, Less and Swap methods that sort.Sort needs,
+// ordering documents by FilePath.
+//
+// Used to make sure that we're comparing documents in order. This is needed
+// since these documents will be sent concurrently.
+type sortableDocs []doc.KustomizationDocument
+
+// Less reports whether document i has a smaller FilePath than document j.
+func (s sortableDocs) Less(i, j int) bool {
+	return s[i].FilePath < s[j].FilePath
+}
+
+// Swap exchanges the documents at positions i and j.
+func (s sortableDocs) Swap(i, j int) {
+	s[i], s[j] = s[j], s[i]
+}
+
+// Len returns the number of documents.
+func (s sortableDocs) Len() int {
+	return len(s)
+}
+
+// TestCrawlerRunner checks that CrawlerRunner forwards the documents of every
+// crawler to the output channel and reports each crawler's error at the
+// crawler's own index.
+func TestCrawlerRunner(t *testing.T) {
+	fmt.Println("testing CrawlerRunner")
+	tests := []struct {
+		tc   []Crawler
+		errs []error
+		docs sortableDocs
+	}{
+		{
+			tc: []Crawler{
+				testCrawler{
+					docs: []doc.KustomizationDocument{
+						{Document: doc.Document{
+							FilePath: "crawler1/doc1/kustomization.yaml",
+						}},
+						{Document: doc.Document{
+							FilePath: "crawler1/doc2/kustomization.yaml",
+						}},
+						{Document: doc.Document{
+							FilePath: "crawler1/doc3/kustomization.yaml",
+						}},
+					},
+				},
+				testCrawler{err: errors.New("crawler2")},
+				testCrawler{},
+				testCrawler{
+					docs: []doc.KustomizationDocument{
+						{Document: doc.Document{
+							FilePath: "crawler4/doc1/kustomization.yaml",
+						}},
+						{Document: doc.Document{
+							FilePath: "crawler4/doc2/kustomization.yaml",
+						}},
+					},
+					err: errors.New("crawler4"),
+				},
+			},
+			errs: []error{
+				nil,
+				errors.New("crawler2"),
+				nil,
+				errors.New("crawler4"),
+			},
+			docs: sortableDocs{
+				{Document: doc.Document{
+					FilePath: "crawler1/doc1/kustomization.yaml",
+				}},
+				{Document: doc.Document{
+					FilePath: "crawler1/doc2/kustomization.yaml",
+				}},
+				{Document: doc.Document{
+					FilePath: "crawler1/doc3/kustomization.yaml",
+				}},
+				{Document: doc.Document{
+					FilePath: "crawler4/doc1/kustomization.yaml",
+				}},
+				{Document: doc.Document{
+					FilePath: "crawler4/doc2/kustomization.yaml",
+				}},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		output := make(chan CrawlerDocument)
+		wg := sync.WaitGroup{}
+		wg.Add(1)
+
+		// Run the Crawler runner with a list of crawlers. The goroutine
+		// is waited for at the end of this iteration, so capturing the
+		// loop variable is safe.
+		go func() {
+			defer close(output)
+			defer wg.Done()
+
+			errs := CrawlerRunner(context.Background(),
+				output, test.tc)
+
+			// Check that errors are returned as they should be.
+			if !reflect.DeepEqual(errs, test.errs) {
+				t.Errorf("Expected errs (%v) to equal (%v)",
+					errs, test.errs)
+			}
+
+		}()
+
+		// Iterate over the output channel of Crawler runner.
+		returned := make(sortableDocs, 0, len(test.docs))
+		for o := range output {
+			d, ok := o.(*doc.KustomizationDocument)
+			if !ok || d == nil {
+				t.Errorf("%T not expected type (%T)",
+					o, d)
+				// d is nil here; skip it rather than panic on
+				// the dereference below. The channel still has
+				// to be drained so that the runner can finish.
+				continue
+			}
+			returned = append(returned, *d)
+		}
+
+		// Check that all documents are received.
+		sort.Sort(returned)
+		if !reflect.DeepEqual(returned, test.docs) {
+			t.Errorf("Expected docs (%v) to equal (%v)\n",
+				returned, test.docs)
+		}
+
+		wg.Wait()
+	}
+}
+
+// TestCrawlFromSeed runs CrawlFromSeed over a small corpus served by a
+// testCrawler and checks that the number of distinct indexed documents equals
+// the corpus size and that no document is indexed more than once.
+func TestCrawlFromSeed(t *testing.T) {
+	fmt.Println("testing CrawlFromSeed")
+
+	tests := []struct {
+		seed    CrawlerSeed
+		matcher string
+		corpus  []doc.KustomizationDocument
+	}{
+		{
+			seed: CrawlerSeed{
+				{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/helloWorld/kustomization.yaml",
+				},
+				{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/kustomization.yaml",
+				},
+			},
+			matcher: kustomizeRepo,
+			corpus: []doc.KustomizationDocument{
+				// Visited from the seed, will be ignored in the crawl.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/helloWorld/kustomization.yaml",
+					DocumentData: `
+resources:
+- deployment.yaml
+`,
+				}},
+				// Also visited from the seed as a relative resource.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/helloWorld/deployment.yaml",
+					DocumentData: `
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: hello
+`,
+				}},
+				// Visited from the seed. Has a remote import.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/kustomization.yaml",
+					DocumentData: `
+resources:
+- https://github.com/kubernetes-sigs/kustomize/examples/other/overlay
+- service.yaml
+`,
+				}},
+				// Imported as a base from the seed.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/overlay/kustomization.yaml",
+					DocumentData: `
+resources:
+- https://github.com/kubernetes-sigs/kustomize/examples/seedcrawl1
+- https://github.com/kubernetes-sigs/kustomize/examples/seedcrawl2
+`,
+				}},
+				// Imported as a resource from the seed.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/service.yaml",
+				}},
+				// Visited from crawling seed.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/seedcrawl1/kustomization.yml",
+				}},
+				// Visited from crawling seed.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/seedcrawl2/kustomization.yaml",
+					DocumentData: `
+resources:
+- ../base
+- job.yaml
+`,
+				}},
+				// Visited from crawling seed.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/base/kustomization.yml",
+				}},
+				// Visited from crawling seed imported as resource.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/seedcrawl2/job.yaml",
+				}},
+				// Visited from the crawler runner.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/base/kustomization.yaml",
+					DocumentData: `
+resources:
+- ../app
+`,
+				}},
+				// Visited from the crawler runner.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/app/kustomization.yaml",
+					DocumentData: `
+resources:
+- resource.yaml
+`,
+				}},
+				// Visited from crawling runner imported as resource.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/app/resource.yaml",
+				}},
+			},
+		},
+	}
+
+	for _, tc := range tests {
+		cr := newCrawler(tc.matcher, nil, tc.corpus)
+		// visited counts how many times each document ID was indexed.
+		visited := make(map[string]int)
+		CrawlFromSeed(context.Background(), tc.seed, []Crawler{cr},
+			// Converter: wrap the fetched document as is.
+			func(d *doc.Document) (CrawlerDocument, error) {
+				return &doc.KustomizationDocument{
+					Document: *d,
+				}, nil
+			},
+			// IndexFunc: only record the visit.
+			func(d CrawlerDocument, cr Crawler) error {
+				visited[d.ID()]++
+				return nil
+			},
+		)
+		// The number of distinct indexed IDs must equal the corpus size.
+		if lv, lc := len(visited), len(tc.corpus); lv != lc {
+			t.Errorf("error: %d of %d documents visited.", lv, lc)
+			t.Errorf("\nvisited (%v)\nexpected (%v).", visited, cr.lukp)
+		}
+		for id, cnt := range visited {
+			if cnt != 1 {
+				t.Errorf("%s not visited once (%d)", id, cnt)
+			}
+		}
+	}
+}
--- a/internal/crawl/crawler/github/crawler.go
+++ b/internal/crawl/crawler/github/crawler.go
@@ -0,0 +1,582 @@
+// Package github implements the crawler.Crawler interface, getting data
+// from the Github search API.
+package github
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"math"
+	"net/http"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+
+	"sigs.k8s.io/kustomize/internal/tools/crawler"
+	"sigs.k8s.io/kustomize/internal/tools/doc"
+	"sigs.k8s.io/kustomize/internal/tools/httpclient"
+	"sigs.k8s.io/kustomize/v3/pkg/git"
+	"sigs.k8s.io/kustomize/v3/pkg/pgmconfig"
+)
+
+// logger writes to stdout with the "Github Crawler: " prefix. The flags select
+// the standard date/time stamp in UTC plus the full file path and line number
+// of the call site.
+var logger = log.New(os.Stdout, "Github Crawler: ",
+	log.LstdFlags|log.LUTC|log.Llongfile)
+
+// githubCrawler implements crawler.Crawler.
+type githubCrawler struct {
+	// client performs the HTTP requests.
+	client GitHubClient
+	// query is the search that Crawl splits into ranges and processes.
+	query Query
+}
+
+// GitHubClient bundles the request configuration with the HTTP client that is
+// used for every request.
+type GitHubClient struct {
+	RequestConfig
+	// retryCount is the maximum number of retries getWithRetry performs
+	// when a request is answered with "403 Forbidden".
+	retryCount uint64
+	// client performs the actual HTTP requests.
+	client *http.Client
+}
+
+// NewClient returns a GitHubClient that sends its requests through client,
+// retries forbidden requests up to retryCount times, and is configured with
+// the given access token and the maximum page size.
+func NewClient(accessToken string, retryCount uint64, client *http.Client) GitHubClient {
+	cfg := RequestConfig{
+		perPage:     githubMaxPageSize,
+		accessToken: accessToken,
+	}
+	return GitHubClient{
+		RequestConfig: cfg,
+		retryCount:    retryCount,
+		client:        client,
+	}
+}
+
+// NewCrawler returns a crawler that runs query through a GitHubClient built
+// from accessToken, retryCount and client (see NewClient).
+func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
+	query Query) githubCrawler {
+
+	return githubCrawler{
+		client: NewClient(accessToken, retryCount, client),
+		query:  query,
+	}
+}
+
+// Crawl implements crawler.Crawler. It splits the crawler's query into ranges,
+// processes every range, and forwards the documents found to output. The
+// errors of all ranges are collected and returned together; nil is returned
+// when no range failed.
+func (gc githubCrawler) Crawl(
+	ctx context.Context, output chan<- crawler.CrawlerDocument) error {
+
+	// The ranges are computed with a client that has its own plain
+	// http.Client (same timeout) instead of the crawler's configured one.
+	noETagClient := GitHubClient{
+		RequestConfig: gc.client.RequestConfig,
+		client:        &http.Client{Timeout: gc.client.client.Timeout},
+		retryCount:    gc.client.retryCount,
+	}
+
+	// Since Github returns a max of 1000 results per query, we can use
+	// multiple queries that split the search space into chunks of at most
+	// 1000 files to get all of the data.
+	ranges, err := FindRangesForRepoSearch(newCache(noETagClient, gc.query))
+	if err != nil {
+		return fmt.Errorf("could not split %v into ranges, %v\n",
+			gc.query, err)
+	}
+
+	logger.Println("ranges: ", ranges)
+
+	// Query each range for files, remembering every failure.
+	failures := make(multiError, 0)
+	for _, rangeQuery := range ranges {
+		if qErr := processQuery(ctx, gc.client, rangeQuery, output); qErr != nil {
+			failures = append(failures, qErr)
+		}
+	}
+
+	if len(failures) == 0 {
+		return nil
+	}
+	return failures
+}
+
+// FetchDocument downloads the raw content of d from raw.githubusercontent.com
+// and stores it in d.DocumentData. d.IsSame records whether the response was
+// served from the HTTP cache.
+//
+// The document path is tried as is first. If that fails it is treated as a
+// directory and every kustomization file name is tried inside of it; the name
+// that succeeds is appended to d.FilePath. An error is returned when the
+// repository URL cannot be parsed or when none of the candidates can be
+// fetched.
+func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) error {
+	repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch
+	repoSpec, err := git.NewRepoSpecFromUrl(repoURL)
+	if err != nil {
+		return fmt.Errorf("invalid repospec: %v", err)
+	}
+
+	url := "https://raw.githubusercontent.com/" + repoSpec.OrgRepo +
+		"/" + repoSpec.Ref + "/" + repoSpec.Path
+
+	// fetch downloads rawURL into d and appends suffix to d.FilePath. Any
+	// status other than 200 is a failure, and the response body is closed
+	// on every path.
+	fetch := func(rawURL, suffix string) error {
+		resp, err := gc.client.GetRawUserContent(rawURL)
+		if resp != nil {
+			defer resp.Body.Close()
+		}
+		if err != nil {
+			return err
+		}
+		if resp.StatusCode != http.StatusOK {
+			return fmt.Errorf("unexpected status '%s' for '%s'",
+				resp.Status, rawURL)
+		}
+
+		d.IsSame = httpclient.FromCache(resp.Header)
+		data, err := ioutil.ReadAll(resp.Body)
+		if err != nil {
+			return err
+		}
+		d.DocumentData = string(data)
+		d.FilePath = d.FilePath + suffix
+		return nil
+	}
+
+	if err := fetch(url, ""); err == nil {
+		return nil
+	}
+
+	// The path may be a directory: look for a kustomization file in it and
+	// stop at the first one that exists.
+	for _, file := range pgmconfig.KustomizationFileNames {
+		if err := fetch(url+"/"+file, "/"+file); err == nil {
+			return nil
+		}
+	}
+	return fmt.Errorf("File Not Found: %s", url)
+}
+
+// SetCreated asks the client for the creation time of d and stores it in
+// d.CreationTime. The document is left untouched when the lookup fails.
+//
+// NOTE(review): the file spec is built with FullName set to the repository
+// URL joined with the file path and with an empty Path, while
+// GetFileCreationTime hands FullName and Path to CommitsRequest separately —
+// confirm that this is the spec CommitsRequest expects.
+func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
+	spec := GithubFileSpec{}
+	spec.Repository.FullName = d.RepositoryURL + "/" + d.FilePath
+
+	created, err := gc.client.GetFileCreationTime(spec)
+	if err != nil {
+		return err
+	}
+
+	d.CreationTime = &created
+	return nil
+}
+
+// Match reports whether d can be parsed into a repo spec whose host contains
+// "github.com".
+//
+// NOTE(review): unlike FetchDocument, the URL built here has a "/" between
+// "?ref=" and the branch name — confirm whether that is intended.
+func (gc githubCrawler) Match(d *doc.Document) bool {
+	rawURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + "/" +
+		d.DefaultBranch
+
+	if spec, err := git.NewRepoSpecFromUrl(rawURL); err == nil {
+		return strings.Contains(spec.Host, "github.com")
+	}
+	return false
+}
+
+// processQuery follows all of the pages in a query, and updates/adds the
+// documents from the crawl to the datastore/index.
+//
+// Every file of every page is converted into a kustomization document and
+// sent to output. Pages and files that fail are skipped and their errors are
+// collected. The returned error is nil when nothing failed, and a multiError
+// holding all collected errors otherwise.
+func processQuery(ctx context.Context, gcl GitHubClient, query string,
+	output chan<- crawler.CrawlerDocument) error {
+
+	queryPages := make(chan GithubResponseInfo)
+
+	go func() {
+		// Forward the document metadata to the retrieval channel.
+		// This separation allows for concurrent requests for the code
+		// search, and the retrieval portions of the API.
+		err := gcl.ForwardPaginatedQuery(ctx, query, queryPages)
+		if err != nil {
+			// TODO(damienr74) handle this error with redis?
+			logger.Println(err)
+		}
+		close(queryPages)
+	}()
+
+	errs := make(multiError, 0)
+	errorCnt := 0
+	totalCnt := 0
+	for page := range queryPages {
+		if page.Error != nil {
+			errs = append(errs, page.Error)
+			continue
+		}
+
+		for _, file := range page.Parsed.Items {
+			k, err := kustomizationResultAdapter(gcl, file)
+			if err != nil {
+				errs = append(errs, err)
+				errorCnt++
+				continue
+			}
+			output <- k
+			totalCnt++
+		}
+
+		logger.Printf("got %d files out of %d from API. %d of %d had errors\n",
+			totalCnt, page.Parsed.TotalCount, errorCnt, totalCnt)
+	}
+
+	// An empty multiError stored in an error interface is still non-nil, so
+	// the success case has to return a literal nil.
+	if len(errs) == 0 {
+		return nil
+	}
+	return errs
+}
+
+// kustomizationResultAdapter turns a search result into a kustomization
+// document: it downloads the file data and looks up the default branch of the
+// repository. When the default branch cannot be determined, the failure is
+// logged and "master" is used. An error is returned only when the file data
+// cannot be fetched.
+func kustomizationResultAdapter(gcl GitHubClient, k GithubFileSpec) (
+	crawler.CrawlerDocument, error) {
+
+	data, err := gcl.GetFileData(k)
+	if err != nil {
+		return nil, err
+	}
+
+	url := gcl.ReposRequest(k.Repository.FullName)
+	defaultBranch, err := gcl.GetDefaultBranch(url)
+	if err != nil {
+		logger.Printf(
+			"(error: %v) setting default_branch to master\n", err)
+		defaultBranch = "master"
+	}
+
+	// Named kdoc so that it does not shadow the doc package.
+	kdoc := doc.KustomizationDocument{
+		Document: doc.Document{
+			DocumentData:  string(data),
+			FilePath:      k.Path,
+			DefaultBranch: defaultBranch,
+			RepositoryURL: k.Repository.URL,
+		},
+	}
+
+	return &kdoc, nil
+}
+
+// ForwardPaginatedQuery follows the links to the next pages and performs all of
+// the queries for a given search query, relaying the data from each request
+// back to an output channel.
+//
+// The first failing request ends the pagination and its error is returned; the
+// failed response is not sent to output. When ctx is done the function stops
+// between two pages and returns nil.
+func (gcl GitHubClient) ForwardPaginatedQuery(ctx context.Context, query string,
+	output chan<- GithubResponseInfo) error {
+
+	logger.Println("querying: ", query)
+
+	page := gcl.parseGithubResponse(query)
+	if page.Error != nil {
+		return page.Error
+	}
+	output <- page
+
+	for {
+		// Without both a "last" and a "next" link there is nothing left
+		// to follow.
+		if page.LastURL == "" || page.NextURL == "" {
+			return nil
+		}
+
+		// Non-blocking cancellation check before requesting a page.
+		select {
+		case <-ctx.Done():
+			return nil
+		default:
+		}
+
+		page = gcl.parseGithubResponse(page.NextURL)
+		if page.Error != nil {
+			return page.Error
+		}
+		output <- page
+	}
+}
+
+// GetFileData gets the bytes from a file.
+//
+// The contents endpoint is queried first to learn the file's 'download_url',
+// then the raw data is fetched from that URL. An error is returned when either
+// request fails, when a response cannot be read, or when the metadata is not
+// valid JSON.
+func (gcl GitHubClient) GetFileData(k GithubFileSpec) ([]byte, error) {
+
+	url := gcl.ContentsRequest(k.Repository.FullName, k.Path)
+
+	resp, err := gcl.GetReposData(url)
+	if err != nil {
+		return nil, fmt.Errorf("%+v: could not get '%s' metadata: %v",
+			k, url, err)
+	}
+
+	data, err := ioutil.ReadAll(resp.Body)
+	// Close before looking at the read error, so that the body is released
+	// on the failure path as well.
+	resp.Body.Close()
+	if err != nil {
+		return nil, fmt.Errorf("%+v: could not read '%s' metadata: %v",
+			k, url, err)
+	}
+
+	type githubContentRawURL struct {
+		DownloadURL string `json:"download_url,omitempty"`
+	}
+	var rawURL githubContentRawURL
+	err = json.Unmarshal(data, &rawURL)
+	if err != nil {
+		return nil, fmt.Errorf(
+			"%+v: could not get 'download_url' from '%s' response: %v",
+			k, data, err)
+	}
+
+	resp, err = gcl.GetRawUserContent(rawURL.DownloadURL)
+	if err != nil {
+		return nil, fmt.Errorf("%+v: could not fetch file raw data '%s': %v",
+			k, rawURL.DownloadURL, err)
+	}
+
+	defer resp.Body.Close()
+	data, err = ioutil.ReadAll(resp.Body)
+	return data, err
+}
+
+// GetDefaultBranch requests url through the repos API and returns the
+// 'default_branch' field of the JSON response. An error is returned when the
+// request fails, the body cannot be read, or the body is not valid JSON.
+func (gcl GitHubClient) GetDefaultBranch(url string) (string, error) {
+	resp, err := gcl.GetReposData(url)
+	if err != nil {
+		return "", fmt.Errorf(
+			"'%s' could not get default_branch: %v", url, err)
+	}
+	defer resp.Body.Close()
+
+	body, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return "", fmt.Errorf(
+			"could not read default_branch: %v", err)
+	}
+
+	// Only the default branch is of interest in the response.
+	var repo struct {
+		DefaultBranch string `json:"default_branch,omitempty"`
+	}
+	if err := json.Unmarshal(body, &repo); err != nil {
+		return "", fmt.Errorf(
+			"default_branch json malformed: %v", err)
+	}
+
+	return repo.DefaultBranch, nil
+}
+
+// GetFileCreationTime gets the earliest date of a file.
+//
+// The commits of the file are requested; when the response links to a last
+// page, that page is requested instead. The author date of the last commit in
+// the resulting list is parsed as RFC 3339 and returned. When a request fails
+// or the response cannot be read or decoded, the current time is returned
+// together with the error.
+func (gcl GitHubClient) GetFileCreationTime(
+	k GithubFileSpec) (time.Time, error) {
+
+	url := gcl.CommitsRequest(k.Repository.FullName, k.Path)
+
+	defaultTime := time.Now()
+
+	resp, err := gcl.GetReposData(url)
+	if err != nil {
+		return defaultTime, fmt.Errorf(
+			"%+v: '%s' could not get metadata: %v", k, url, err)
+	}
+
+	type DateSpec struct {
+		Commit struct {
+			Author struct {
+				Date string `json:"date,omitempty"`
+			} `json:"author,omitempty"`
+		} `json:"commit,omitempty"`
+	}
+
+	_, lastURL := parseGithubLinkFormat(resp.Header.Get("link"))
+	if lastURL != "" {
+		// Only the header of the first page was needed; release its
+		// body before resp is replaced by the last page.
+		resp.Body.Close()
+		resp, err = gcl.GetReposData(lastURL)
+		if err != nil {
+			return defaultTime, fmt.Errorf(
+				"%+v: '%s' could not get metadata: %v",
+				k, lastURL, err)
+		}
+	}
+
+	defer resp.Body.Close()
+	data, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return defaultTime, fmt.Errorf(
+			"%+v: failed to read metadata: %v", k, err)
+	}
+	earliestDate := []DateSpec{}
+	err = json.Unmarshal(data, &earliestDate)
+	size := len(earliestDate)
+	if err != nil || size == 0 {
+		return defaultTime, fmt.Errorf(
+			"%+v: server response '%s' not in expected format: %v",
+			k, data, err)
+	}
+
+	return time.Parse(time.RFC3339, earliestDate[size-1].Commit.Author.Date)
+}
+
+// TODO(damienr74) change the tickers to actually check api rate limits, reset
+// times, and throttle requests dynamically based off of current utilization,
+// instead of hardcoding the documented values, these calls are not quota'd.
+// This is now especially important, since caching the API requests will reduce
+// API quota use (so we can actually make more requests in the allotted time
+// period).
+//
+// See https://developer.github.com/v3/rate_limit/ for details.
+var (
+	// searchRateTicker ticks every two seconds; throttleSearchAPI waits for
+	// its next tick.
+	searchRateTicker = time.NewTicker(time.Second * 2)
+	// contentRateTicker ticks every second; throttleRepoAPI waits for its
+	// next tick.
+	contentRateTicker = time.NewTicker(time.Second * 1)
+)
+
+// throttleSearchAPI blocks until the next tick of searchRateTicker.
+func throttleSearchAPI() {
+	<-searchRateTicker.C
+}
+
+// throttleRepoAPI blocks until the next tick of contentRateTicker.
+func throttleRepoAPI() {
+	<-contentRateTicker.C
+}
+
+// multiError is a list of errors that is reported as a single error.
+type multiError []error
+
+// Error renders the list as "Errors [", one tab-indented line per error, and a
+// closing "]" on its own line.
+func (me multiError) Error() string {
+	var b strings.Builder
+	b.WriteString("Errors [")
+	for _, err := range me {
+		b.WriteString("\n\t")
+		b.WriteString(err.Error())
+	}
+	b.WriteString("\n]")
+	return b.String()
+}
+
+// GithubFileSpec is the JSON description of one file in a search response.
+type GithubFileSpec struct {
+	// Path is decoded from the "path" field.
+	Path       string `json:"path,omitempty"`
+	Repository struct {
+		// API is decoded from the repository's "url" field.
+		API string `json:"url,omitempty"`
+		// URL is decoded from the repository's "html_url" field.
+		URL string `json:"html_url,omitempty"`
+		// FullName is decoded from the repository's "full_name" field.
+		FullName string `json:"full_name,omitempty"`
+	} `json:"repository,omitempty"`
+}
+
+// githubResponse is the decoded JSON body of a search response.
+type githubResponse struct {
+	// MaxUint is reserved as a sentinel value.
+	// This is the number of files that match the query.
+	// parseGithubResponse presets the field to math.MaxUint64 before
+	// decoding, so the sentinel remains when the body has no "total_count".
+	TotalCount uint64 `json:"total_count,omitempty"`
+
+	// Github representation of a file.
+	Items []GithubFileSpec `json:"items,omitempty"`
+}
+
+// GithubResponseInfo is the outcome of one search request, as produced by
+// parseGithubResponse.
+type GithubResponseInfo struct {
+	*http.Response
+	// Parsed is the decoded body; it stays nil unless the request succeeded
+	// with status 200 and the body could be decoded.
+	Parsed *githubResponse
+	// Error is the request, read, status, or decoding error, if any.
+	Error error
+	// NextURL and LastURL are the "next" and "last" targets of the
+	// response's link header; empty when the header does not provide them.
+	NextURL string
+	LastURL string
+}
+
+// githubLinkRE matches one entry of a link header whose relation is "next" or
+// "last", capturing the URL and the relation. It is compiled once instead of
+// on every call of parseGithubLinkFormat.
+var githubLinkRE = regexp.MustCompile(`<(.*)>.*; rel="(last|next)"`)
+
+// parseGithubLinkFormat extracts the "next" and "last" URLs from the value of
+// a link header, for example
+//
+//	<https://host/path?page=2>; rel="next", <https://host/path?page=5>; rel="last"
+//
+// Entries with any other relation are ignored. A URL that is not present is
+// returned as the empty string.
+func parseGithubLinkFormat(links string) (string, string) {
+	const (
+		linkNext    = "next"
+		linkLast    = "last"
+		linkInfoURL = 1
+		linkInfoRel = 2
+	)
+
+	next, last := "", ""
+
+	// The header is a comma separated list of entries.
+	for _, link := range strings.Split(links, ",") {
+		linkParse := githubLinkRE.FindStringSubmatch(link)
+		if len(linkParse) != 3 {
+			continue
+		}
+
+		url := linkParse[linkInfoURL]
+		switch linkParse[linkInfoRel] {
+		case linkNext:
+			next = url
+		case linkLast:
+			last = url
+		}
+	}
+
+	return next, last
+}
+
+// parseGithubResponse performs the search request and gathers its outcome:
+// the response itself, the "next"/"last" links of its link header, and the
+// decoded body. Failures are reported through the Error field of the result:
+// a request error, a body that cannot be read, a status other than 200 (which
+// is also logged together with the headers and the body), or a body that
+// cannot be decoded. Parsed is only set when everything succeeded.
+func (gcl GitHubClient) parseGithubResponse(getRequest string) GithubResponseInfo {
+	resp, err := gcl.SearchGithubAPI(getRequest)
+	info := GithubResponseInfo{
+		Response: resp,
+		Error:    err,
+	}
+	if err != nil || resp == nil {
+		return info
+	}
+	defer resp.Body.Close()
+
+	body, readErr := ioutil.ReadAll(resp.Body)
+	info.Error = readErr
+	if readErr != nil {
+		return info
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		logger.Println("query: ", getRequest)
+		logger.Println("status not OK at the source")
+		logger.Println("header dump", resp.Header)
+		logger.Println("body dump", string(body))
+		info.Error = fmt.Errorf("request rejected, status '%s'",
+			resp.Status)
+		return info
+	}
+
+	info.NextURL, info.LastURL = parseGithubLinkFormat(resp.Header.Get("link"))
+
+	// TotalCount starts out as the sentinel value, so a body without a
+	// total count can be told apart from a count of zero.
+	parsed := githubResponse{TotalCount: math.MaxUint64}
+	if info.Error = json.Unmarshal(body, &parsed); info.Error != nil {
+		return info
+	}
+
+	info.Parsed = &parsed
+	return info
+}
+
+// SearchGithubAPI performs a search query and handles rate limiting for
+// the 'code/search?' endpoint as well as timed retries in the case of abuse
+// prevention. It waits for the search ticker before sending the request.
+func (gcl GitHubClient) SearchGithubAPI(query string) (*http.Response, error) {
+	throttleSearchAPI()
+	return gcl.getWithRetry(query)
+}
+
+// GetReposData performs a request and handles rate limiting for
+// the '/repos' endpoint as well as timed retries in the case of abuse
+// prevention. It waits for the content ticker before sending the request.
+func (gcl GitHubClient) GetReposData(query string) (*http.Response, error) {
+	throttleRepoAPI()
+	return gcl.getWithRetry(query)
+}
+
+// GetRawUserContent requests query without throttling, but with the same
+// retry handling as the other requests.
+//
+// User content (file contents) is not API rate limited, so there's no use in
+// throttling this call.
+func (gcl GitHubClient) GetRawUserContent(query string) (*http.Response, error) {
+	return gcl.getWithRetry(query)
+}
+
+// getWithRetry sends a GET request for query. While the answer is
+// "403 Forbidden" and retries are left, it sleeps for the number of seconds
+// given by the 'Retry-After' header and sends the request again, up to
+// retryCount times.
+//
+// A forbidden response without a usable 'Retry-After' header is returned
+// together with an error. A transport error is returned wrapped with the
+// query. A response that is still forbidden once the retries are used up is
+// returned without an error, so callers have to check the status code.
+func (gcl GitHubClient) getWithRetry(
+	query string) (resp *http.Response, err error) {
+
+	resp, err = gcl.client.Get(query)
+	retryCount := gcl.retryCount
+
+	for err == nil &&
+		resp.StatusCode == http.StatusForbidden &&
+		retryCount > 0 {
+
+		retryTime := resp.Header.Get("Retry-After")
+		// The conversion error gets its own name. Declaring a new err
+		// here would shadow the named result, so a failing retry below
+		// would leave the loop condition looking at a nil error and a
+		// nil response.
+		seconds, convErr := strconv.Atoi(retryTime)
+		if convErr != nil {
+			return resp, fmt.Errorf(
+				"query '%s' forbidden without 'Retry-After'", query)
+		}
+		logger.Printf(
+			"status forbidden, retring %d more times\n", retryCount)
+
+		logger.Printf("waiting %d seconds before retrying\n", seconds)
+
+		// The rejected response is not returned to anyone; close its
+		// body before it is replaced.
+		resp.Body.Close()
+
+		time.Sleep(time.Second * time.Duration(seconds))
+		retryCount--
+		resp, err = gcl.client.Get(query)
+	}
+
+	if err != nil {
+		return resp, fmt.Errorf("query '%s' could not be processed, %v",
+			query, err)
+	}
+
+	return resp, nil
+}
--- a/internal/crawl/crawler/github/queries.go
+++ b/internal/crawl/crawler/github/queries.go
@@ -0,0 +1,224 @@
+package github
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+)
+
+const (
+	// URL parameter names used by makeRequest.
+	perPageArg     = "per_page"
+	accessTokenArg = "access_token"
+
+	// NOTE(review): presumably the largest per_page value GitHub accepts;
+	// it is not referenced in queries.go, so confirm against its callers.
+	githubMaxPageSize = 100
+)
+
+// queryField is one search term: a bare value, or name:value when named.
+type queryField struct {
+	name  string
+	value interface{}
+}
+
+// Formats a query field.
+func (qf queryField) String() string {
+	var value string
+	switch v := qf.value.(type) {
+	case string:
+		value = v
+	case rangeFormatter:
+		value = v.RangeString()
+	default:
+		value = fmt.Sprint(v)
+	}
+
+	if qf.name == "" {
+		return value
+	}
+	return fmt.Sprint(qf.name, ":", value)
+}
+
+// Query is a list of search terms. Example of formatting a query:
+// QueryWith(
+//	Filename("kustomization.yaml"),
+//	Filesize(RangeWithin{64, 192}),
+//	Keyword("copyright"),
+//	Keyword("2019"),
+// ).String()
+//
+// Outputs "q=filename:kustomization.yaml+size:64..192+copyright+2019" which
+// would search for files that have [64, 192] bytes (inclusive range) and that
+// contain the keywords 'copyright' and '2019' somewhere in the file.
+type Query []queryField
+
+func QueryWith(qfs ...queryField) Query {
+	return Query(qfs)
+}
+
+// String joins the non-empty fields with "+" and prefixes the result with
+// "q=". It returns the empty string when no field renders to any text.
+func (q Query) String() string {
+	terms := make([]string, 0, len(q))
+	for i := range q {
+		if term := q[i].String(); term != "" {
+			terms = append(terms, term)
+		}
+	}
+
+	if len(terms) == 0 {
+		return ""
+	}
+	return "q=" + strings.Join(terms, "+")
+}
+
+// Keyword takes a single word, and formats it according to the Github API.
+func Keyword(k string) queryField {
+	return queryField{value: k}
+}
+
+// Filesize takes a rangeFormatter and formats it according to the Github API.
+func Filesize(r rangeFormatter) queryField {
+	return queryField{name: "size", value: r}
+}
+
+// Filename takes a filename and formats it according to the Github API.
+func Filename(f string) queryField {
+	return queryField{name: "filename", value: f}
+}
+
+// Path takes a filepath and formats it according to the Github API.
+func Path(p string) queryField {
+	return queryField{name: "path", value: p}
+}
+
+// RequestConfig stores common variables that must be present for the queries.
+// - CodeSearchRequests: ask Github to check the code indices given a query.
+// - ContentsRequests: ask Github where to download a resource given a repo and a
+// file path.
+// - CommitsRequests: ask Github to list the commits made on a file. Useful to
+// determine the date of a file.
+type RequestConfig struct {
+	perPage     uint64
+	accessToken string
+}
+
+func NewRequestConfig(perPage uint64, accessToken string) RequestConfig {
+	return RequestConfig{
+		perPage:     perPage,
+		accessToken: accessToken,
+	}
+}
+
+// CodeSearchRequestWith returns a request for the "search/code" endpoint
+// carrying the given (possibly partial) query, sorted by "indexed" in
+// descending order. Call the URL method to get the string form; use
+// request.CopyWith to derive requests with extra query fields.
+func (rc RequestConfig) CodeSearchRequestWith(query Query) request {
+	searchReq := rc.makeRequest("search/code", query)
+	searchReq.vals.Set("order", "desc")
+	searchReq.vals.Set("sort", "indexed")
+	return searchReq
+}
+
+// ContentsRequest returns the URL of the "repos/<fullRepoName>/contents/<path>"
+// endpoint, which is used to find the download information for that file.
+func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string {
+	endpoint := "repos/" + fullRepoName + "/contents/" + path
+	req := rc.makeRequest(endpoint, Query{})
+	return req.URL()
+}
+
+// ReposRequest returns the URL of the "repos/<fullRepoName>" endpoint.
+func (rc RequestConfig) ReposRequest(fullRepoName string) string {
+	endpoint := "repos/" + fullRepoName
+	req := rc.makeRequest(endpoint, Query{})
+	return req.URL()
+}
+
+// CommitsRequest returns the URL of the "repos/<fullRepoName>/commits"
+// endpoint with a "q=path:<path>" term appended, which is used to find the
+// commits that affect the file.
+// NOTE(review): confirm that the commits endpoint honors a "q=path:" term
+// rather than expecting a dedicated path parameter.
+func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
+	endpoint := "repos/" + fullRepoName + "/commits"
+	req := rc.makeRequest(endpoint, QueryWith(Path(path)))
+	return req.URL()
+}
+
+// makeRequest builds a request for https://api.github.com/<path> carrying
+// the per_page value, the access token when one is configured, and the given
+// search query.
+func (rc RequestConfig) makeRequest(path string, query Query) request {
+	params := url.Values{perPageArg: {fmt.Sprint(rc.perPage)}}
+	if rc.accessToken != "" {
+		params.Set(accessTokenArg, rc.accessToken)
+	}
+
+	endpoint := url.URL{
+		Scheme: "https",
+		Host:   "api.github.com",
+		Path:   path,
+	}
+	return request{url: endpoint, vals: params, query: query}
+}
+
+// request is a GitHub API call under construction: the endpoint URL, the
+// URL-encoded parameters, and the search query that URL appends unencoded.
+type request struct {
+	url   url.URL
+	vals  url.Values
+	query Query
+}
+
+// CopyWith returns a copy of the request with queryParams appended to its
+// query. Useful for dynamically adding sizes to a filename-only query
+// without modifying it.
+//
+// The query is copied into a fresh slice. Appending directly to r.query
+// could write into spare capacity of the original backing array, so two
+// copies made from the same base request would overwrite each other's
+// fields. The vals map is still shared with the receiver.
+func (r request) CopyWith(queryParams ...queryField) request {
+	cpy := r
+	cpy.query = make(Query, 0, len(r.query)+len(queryParams))
+	cpy.query = append(cpy.query, r.query...)
+	cpy.query = append(cpy.query, queryParams...)
+	return cpy
+}
+
+// URL returns the string form of the request: the endpoint URL followed by
+// the encoded parameters and then the search query.
+//
+// Github does not handle URL encoding properly in its API for the q='...'
+// parameter, so the query is appended manually without any encoding.
+func (r request) URL() string {
+	encoded := r.vals.Encode()
+	query := r.query.String()
+
+	// url.URL.String already writes the "?" in front of RawQuery, so the only
+	// separator needed here is the "&" between two non-empty parts. Adding a
+	// "?" by hand when there are no encoded values would produce "??q=...".
+	rawQuery := encoded
+	switch {
+	case encoded == "":
+		rawQuery = query
+	case query != "":
+		rawQuery = encoded + "&" + query
+	}
+
+	r.url.RawQuery = rawQuery
+	return r.url.String()
+}
+
+// rangeFormatter defines a range of numbers and prints it in the github range
+// query format https://help.github.com/en/articles/understanding-the-search-syntax.
+type rangeFormatter interface {
+	RangeString() string
+}
+
+// RangeLessThan covers every value strictly below size.
+type RangeLessThan struct {
+	size uint64
+}
+
+// RangeString renders the range as "<size".
+func (r RangeLessThan) RangeString() string {
+	return fmt.Sprint("<", r.size)
+}
+
+// RangeGreaterThan covers every value strictly above size.
+type RangeGreaterThan struct {
+	size uint64
+}
+
+// RangeString renders the range as ">size".
+func (r RangeGreaterThan) RangeString() string {
+	return fmt.Sprint(">", r.size)
+}
+
+// RangeWithin covers every value from start to end, both included.
+type RangeWithin struct {
+	start uint64
+	end   uint64
+}
+
+// RangeString renders the range as "start..end".
+func (r RangeWithin) RangeString() string {
+	return fmt.Sprint(r.start, "..", r.end)
+}
--- a/internal/crawl/crawler/github/queries_test.go
+++ b/internal/crawl/crawler/github/queries_test.go
@@ -0,0 +1,119 @@
+package github
+
+import (
+	"testing"
+)
+
+func TestQueryFields(t *testing.T) {
+	testCases := []struct {
+		formatter queryField
+		expected  string
+	}{
+		{
+			formatter: Keyword("keyword"),
+			expected:  "keyword",
+		},
+		{
+			formatter: Filesize(RangeLessThan{23}),
+			expected:  "size:<23",
+		},
+		{
+			formatter: Filesize(RangeWithin{24, 64}),
+			expected:  "size:24..64",
+		},
+		{
+			formatter: Filesize(RangeGreaterThan{64}),
+			expected:  "size:>64",
+		},
+		{
+			formatter: Path("some/path/to/file"),
+			expected:  "path:some/path/to/file",
+		},
+		{
+			formatter: Filename("kustomization.yaml"),
+			expected:  "filename:kustomization.yaml",
+		},
+	}
+
+	for _, test := range testCases {
+		if result := test.formatter.String(); result != test.expected {
+			t.Errorf("got (%#v = %s), expected %s", test.formatter, result, test.expected)
+		}
+	}
+}
+
+func TestQueryType(t *testing.T) {
+	testCases := []struct {
+		query    Query
+		expected string
+	}{
+		{
+			query: QueryWith(
+				Filesize(RangeWithin{24, 64}),
+				Filename("kustomization.yaml"),
+				Keyword("keyword1"),
+				Keyword("keyword2"),
+			),
+			expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2",
+		},
+	}
+
+	for _, test := range testCases {
+		if queryStr := test.query.String(); queryStr != test.expected {
+			t.Errorf("got (%#v = %s), expected %s", test.query, queryStr, test.expected)
+		}
+
+	}
+}
+
+// TestGithubSearchQuery checks the URLs built for the code search, contents
+// and commits requests.
+func TestGithubSearchQuery(t *testing.T) {
+	const (
+		accessToken = "random_token"
+		perPage     = 100
+	)
+
+	type urlCase struct {
+		config       RequestConfig
+		search       Query
+		repo         string
+		file         string
+		wantCode     string
+		wantContents string
+		wantCommits  string
+	}
+
+	cases := []urlCase{
+		{
+			config: RequestConfig{
+				perPage:     perPage,
+				accessToken: accessToken,
+			},
+			search: Query{
+				Filename("kustomization.yaml"),
+				Filesize(RangeWithin{64, 128}),
+			},
+			repo: "kubernetes-sigs/kustomize",
+			file: "examples/helloWorld/kustomization.yaml",
+
+			wantCode: "https://api.github.com/search/code?" +
+				"access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128",
+
+			wantContents: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" +
+				"examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100",
+
+			wantCommits: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" +
+				"access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml",
+		},
+	}
+
+	for _, tc := range cases {
+		code := tc.config.CodeSearchRequestWith(tc.search).URL()
+		if code != tc.wantCode {
+			t.Errorf("Got code query: %s, expected %s", code, tc.wantCode)
+		}
+
+		contents := tc.config.ContentsRequest(tc.repo, tc.file)
+		if contents != tc.wantContents {
+			t.Errorf("Got contents query: %s, expected %s", contents, tc.wantContents)
+		}
+
+		commits := tc.config.CommitsRequest(tc.repo, tc.file)
+		if commits != tc.wantCommits {
+			t.Errorf("Got commits query: %s, expected %s", commits, tc.wantCommits)
+		}
+	}
+}
--- a/internal/crawl/crawler/github/split_search_ranges.go
+++ b/internal/crawl/crawler/github/split_search_ranges.go
@@ -0,0 +1,378 @@
+package github
+
+// GitHub only returns at most 1000 results per search query,
+// this is problematic if you want to retrieve all the results for a given
+// search query. However, GitHub allows you to specify as much as you want per
+// query to make things more specific. Specifically for files, GitHub allows
+// you to specify their sizes with range queries. This is very convenient
+// since it allows us to split the search into disjoint sets/shards of results
+// from the different file size ranges.
+//
+// Some important factors to consider:
+//
+// -  These queries are rate limited by the API to roughly one query every two
+//    seconds.
+//
+// -  The search space for file sizes is in bytes, from 0B to < 512KiB (this is
+//    a huge search space that cannot be probed linearly in a timely manner if
+//    granularity is to be expected).
+//
+// -  If you have K files there will likely be ~K/1000 sets that you have to
+//    find in this search space in order to get all of the results.
+//
+// -  If you have O(K) sets it is unlikely that they are all of the same size,
+//    since most file sizes are power law distributed. That means that the
+//    range might be significantly smaller for 1000 small files than it is
+//    for 1000 large files.
+//
+// -  This method is a best effort approach. There are some limitations to what
+//    it can and can't do, so please note the following:
+//
+//    +  There may very well be a filesize that has more than 1000 results.
+//       this method cannot help in this case. However, requerying over time
+//       (days/weeks/months) while sorting by last indexed values may be
+//       sufficient to eventually get all of the results.
+//
+//    +  It's possible that the github API returns inconsistent counts. This
+//       is problematic in most cases, since it can cause many issues if the
+//       case is not handled properly. For instance, if you requested the
+//       number of files of an interval from size:0..64 and get that there
+//       are 900 results, you may query at size:0..96 and get that there
+//       are 800 results. To guarantee that this approach completes and does
+//       not get into a query loop over the same intervals, it will retry a few
+//       times and take the largest of the results or the largest previously
+//       queried value from another range (in this case, the implementation
+//       could decide that size:0..96 must have 900 results). This makes the
+//       approach best effort even if there are no single file sizes of over
+//       1000 results.
+//
+//
+// The approach that was taken to solve this problem is the following:
+//
+// 1. Determine the total number of results by querying from the lower bound
+//    to the upper bound (size:0..max). If there are less than 1000 files,
+//    return a single range of values (size:0..max) since all results can be
+//    retrieved.
+//
+// 2. Otherwise, set a target number of files to be 1000.
+//
+// 3. Binary search for the range from 0..r that provides a file count that is
+//    less than or equal to the target. Once this value is found, store the
+//    upper bound of range (r). If r is the same as the previous value, (or 0)
+//    increase r by one (this guarantees progress, but will miss out on some
+//    results).
+//
+// 4. Increase the target by 1000.
+//
+// 5. Repeat steps 3 and 4 until the target is at or exceeds the total number
+//    of files.
+//
+//
+// In general there are other ways to get all of the files from GitHub. In
+// some cases it would be sufficient to just get the files that are being
+// updated/indexed by github periodically to update the corpus, so this
+// complicated approach does not have to be run every time. However, for
+// some searches, there may be too many results on a time interval to do
+// this simple update search limited to only 1000 results.
+//
+// There is also a more sophisticated approach that may yield better
+// performance:
+// -  Perform this search once and create a prior distribution of file sizes.
+//    Each time you want to retrieve the results of the query, scale the
+//    prior of expected ranges to the current number of files. From each
+//    expected range of 1000 files, perform a exponential search to find the
+//    lower bound of the range. This would likely reduce the total number
+//    of queries by a significant amount since it would only have to search
+//    for a small set of values around each likely range boundary.
+//
+// However, actually retrieving the files will be the bottleneck operation
+// since the number of queries to find the ranges will be close to:
+//   log2(maxFileSize) * totalResults / 1000 ~= totalResults / 50
+// whereas the number of queries to actually get all of the search results
+// are close to:
+//   apiCallsPerResult * 10(pages) * 100(resultsPerPage) * totalResults / 1000
+//   = apiCallsPerResult * totalResults.
+//
+// So it could very well take apiCallsPerResult * 50 times longer to actually
+// fetch the results (assuming the quotas for the API calls are the same as the
+// search API) than it does to perform these range searches.
+
+import (
+	"fmt"
+	"math/bits"
+)
+
+// Searchable files cannot be more than 2^19 bytes, according to
+// https://help.github.com/en/articles/searching-code#considerations-for-code-search
+const (
+	githubMaxFileSize        = uint64(1 << 19)
+	githubMaxResultsPerQuery = uint64(1000)
+)
+
+// cachedSearch is an interface instead of a struct for testing purposes.
+// Not expecting to have multiple non-test implementations.
+type cachedSearch interface {
+	CountResults(uint64) (uint64, error)
+	RequestString(filesize rangeFormatter) string
+}
+
+// githubCachedSearch is a simple data structure that maps the upper bound (r)
+// of a range from 0 to r to the number of files whose size is between 0 and r
+// (inclusive). It also guarantees that the counts are monotonically increasing
+// (not strict) as the value for r increases, by looking at the maximal
+// previous file count for the value that precedes r in the cache.
+//
+// It uses a bit trick to be more efficient in detecting
+// inconsistencies in the returned data from the Github API.
+// Therefore, the cache expects a search to always start at 0, and
+// it expects the max file size to be a power of 2. If this is to be changed
+// there are a few considerations to keep in mind:
+//
+// 1. The cache is only efficient if the queries can be reused, so if
+//    the first chunk of files lives in the range 0..x, continuing the
+//    search for the next chunk from x+1..max (while asymptotically sane)
+//    may actually be less efficient since the cache is essentially reset
+//    at every interval. This leads to a larger number of requests in
+//    practice, and requests are what's expensive (rate limits).
+//
+// 2. The github API is not perfectly monotonic (this is somewhat
+//    problematic). The current cache implementation looks at the
+//    predecessor entry to find out if the current value is monotonic.
+//    This is where the bit trick is used: since each step in the binary
+//    search either adds or omits a decreasing power of 2 to the query
+//    value, we can remove the least significant set bit to find the
+//    predecessor in constant time. Ultimately since the search is rate
+//    limited, we could also easily afford to compute this in linear time
+//    by iterating over cached values. So this trick is not crucial to the
+//    cache's performance.
+type githubCachedSearch struct {
+	cache       map[uint64]uint64
+	gcl         GitHubClient
+	baseRequest request
+}
+
+func newCache(client GitHubClient, query Query) githubCachedSearch {
+	return githubCachedSearch{
+		cache: map[uint64]uint64{
+			0: 0,
+		},
+		gcl:         client,
+		baseRequest: client.CodeSearchRequestWith(query),
+	}
+}
+
+// CountResults returns the number of search results for files whose size is
+// within 0..upperBound. Cached answers are returned without a request;
+// otherwise the count is queried, forced to be at least the predecessor's
+// cached count, stored in the cache and returned.
+func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) {
+	if known, ok := c.cache[upperBound]; ok {
+		return known, nil
+	}
+
+	fileSizes := RangeWithin{0, upperBound}
+	requestURL := c.RequestString(fileSizes)
+
+	resp := c.gcl.parseGithubResponse(requestURL)
+	if resp.Error != nil {
+		return 0, resp.Error
+	}
+
+	// The range search walks the sizes by adding decreasing powers of 2, so
+	// the previously cached bound is the current one with its least
+	// significant set bit removed. Scanning the map for the largest key
+	// below upperBound would work just as well.
+	predecessor := uint64(0)
+	if zeros := bits.TrailingZeros64(upperBound); zeros != 64 {
+		predecessor = upperBound - (1 << uint64(zeros))
+	}
+	floor := c.cache[predecessor]
+
+	// The github API is sometimes not monotonically increasing, or outputs an
+	// erroneous 0 or 1. Ask again a few times, and if the count still falls
+	// below the predecessor's, settle for the predecessor's count so that
+	// the range search is guaranteed to terminate.
+	//
+	// If files were added in the meantime, some files of an already
+	// completed range may be missed, but they should show up the next time
+	// the crawler runs.
+	for attemptsLeft := 4; resp.Parsed.TotalCount < floor; {
+		logger.Printf(
+			"Retrying query... current lower bound: %d, got: %d\n",
+			floor, resp.Parsed.TotalCount)
+
+		resp = c.gcl.parseGithubResponse(requestURL)
+		if resp.Error != nil {
+			return 0, resp.Error
+		}
+
+		attemptsLeft--
+		if attemptsLeft <= 0 {
+			resp.Parsed.TotalCount = floor
+			logger.Println(
+				"Retries for monotonic check exceeded,",
+				" setting value to match predecessor")
+		}
+	}
+
+	total := resp.Parsed.TotalCount
+	logger.Printf("Caching new query %s, with count %d\n",
+		fileSizes.RangeString(), total)
+	c.cache[upperBound] = total
+	return total, nil
+}
+
+// RequestString returns the URL of the base request with a size field for
+// the given range added to its query.
+func (c githubCachedSearch) RequestString(filesize rangeFormatter) string {
+	sized := c.baseRequest.CopyWith(Filesize(filesize))
+	return sized.URL()
+}
+
+// FindRangesForRepoSearch outputs a (possibly incomplete) list of request
+// strings, one per file size range, that together reach most search results.
+// Github search only allows 1,000 results per query (paginated).
+// Source: https://developer.github.com/v3/search/
+//
+// This leaves the possibility of having file sizes with more than 1000 results,
+// This would mean that the search as it is could not find all files. If queries
+// are sorted by last indexed, and retrieved on regular intervals, it should be
+// sufficient to get most if not all documents.
+func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
+	totalFiles, err := cache.CountResults(githubMaxFileSize)
+	if err != nil {
+		return nil, err
+	}
+	logger.Println("total files: ", totalFiles)
+
+	if githubMaxResultsPerQuery >= totalFiles {
+		return []string{
+			cache.RequestString(RangeWithin{0, githubMaxFileSize}),
+		}, nil
+	}
+
+	// Find all the ranges of file sizes such that all files are queryable
+	// using the Github API. This does not compute optimal ranges, since
+	// the number of queries needed to get the information required to
+	// compute an optimal range is expected to be much larger than the
+	// number of queries performed this way.
+	//
+	// The number of ranges is k = (number of files)/1000, and finding a
+	// range is logarithmic in the max file size (n = filesize). This means
+	// that preprocessing takes O(k * lg n) queries to find the ranges with
+	// a binary search over file sizes.
+	//
+	// My intuition is that this approach is competitive to a perfectly
+	// optimal solution, but I didn't actually take the time to do a
+	// rigorous proof. Intuitively, since file sizes are typically power
+	// law distributed the binary search will be very skewed towards the
+	// smaller file ranges. This means that in practice this approach will
+	// make fewer than (#files/1000)*(log(n) = 19) queries for
+	// preprocessing, since it reuses a lot of the queries in the denser
+	// ranges. Furthermore, because of the distribution, it should be very
+	// easy to find ranges that are very close to the upper bound, up to
+	// the limiting factor of having no more than 1000 files accessible per
+	// range.
+	filesAccessible := uint64(0)
+	sizes := make([]uint64, 0)
+	for filesAccessible < totalFiles {
+		target := filesAccessible + githubMaxResultsPerQuery
+		if target >= totalFiles {
+			break
+		}
+
+		logger.Printf("%d accessible files, next target = %d\n",
+			filesAccessible, target)
+
+		cur, err := lowerBoundFileCount(cache, target)
+		if err != nil {
+			return nil, err
+		}
+
+		// If the search landed on the previous boundary again, the next
+		// file size alone exceeds the target: advance anyway and lose out
+		// on some files :(.
+		if l := len(sizes); l > 0 && sizes[l-1] == cur {
+			cur++
+		}
+
+		nextAccessible, err := cache.CountResults(cur)
+		if err != nil {
+			return nil, fmt.Errorf(
+				"cache should be populated at %d already, got %v",
+				cur, err)
+		}
+		if nextAccessible < filesAccessible {
+			return nil, fmt.Errorf(
+				"number of results dropped from %d to %d within range search",
+				filesAccessible, nextAccessible)
+		}
+
+		filesAccessible = nextAccessible
+		if nextAccessible < totalFiles {
+			sizes = append(sizes, cur)
+		}
+	}
+
+	return formatFilesizeRanges(cache, sizes), nil
+}
+
+// lowerBoundFileCount binary searches the file sizes for an upper bound r
+// such that the range [0, r] holds at most targetFileCount results. It
+// starts at 0 and tries to add decreasing powers of two, keeping each one
+// whose count does not exceed the target, and stops at the first probe whose
+// count equals the target exactly.
+//
+// The returned size may already belong to a previous range when the next
+// file size alone holds more results than fit under the target. It is left
+// to the caller to detect this and guarantee forward progression.
+func lowerBoundFileCount(
+	cache cachedSearch, targetFileCount uint64) (uint64, error) {
+
+	lower := uint64(0)
+
+	for step := githubMaxFileSize / 2; step > 0; step /= 2 {
+		probe := lower + step
+
+		found, err := cache.CountResults(probe)
+		if err != nil {
+			return found, err
+		}
+
+		switch {
+		case found == targetFileCount:
+			// Exact hit, no need to look any further.
+			return probe, nil
+		case found < targetFileCount:
+			lower = probe
+		}
+	}
+
+	return lower, nil
+}
+
+// formatFilesizeRanges turns the ascending split sizes into request strings
+// that cover every file size: one range below the first boundary, one
+// inclusive range between each pair of consecutive boundaries, and one open
+// range above the last boundary. It returns an empty list when sizes is
+// empty.
+func formatFilesizeRanges(cache cachedSearch, sizes []uint64) []string {
+	ranges := make([]string, 0, len(sizes)+1)
+	if len(sizes) == 0 {
+		return ranges
+	}
+
+	// Sizes 0..sizes[0].
+	ranges = append(ranges, cache.RequestString(
+		RangeLessThan{sizes[0] + 1},
+	))
+
+	// Sizes sizes[i-1]+1..sizes[i] for each later boundary.
+	for i := 1; i < len(sizes); i++ {
+		ranges = append(ranges, cache.RequestString(
+			RangeWithin{sizes[i-1] + 1, sizes[i]},
+		))
+	}
+
+	// Everything above the last boundary. This must be emitted even when
+	// there is a single boundary, otherwise all larger files are skipped.
+	ranges = append(ranges, cache.RequestString(
+		RangeGreaterThan{sizes[len(sizes)-1]},
+	))
+
+	return ranges
+}
--- a/internal/crawl/crawler/github/split_search_ranges_test.go
+++ b/internal/crawl/crawler/github/split_search_ranges_test.go
@@ -0,0 +1,90 @@
+package github
+
+import (
+	"fmt"
+	"reflect"
+	"testing"
+)
+
+// testCachedSearch is a cachedSearch backed by a fixed map from upper bound
+// to result count, so the range search can be tested without any requests.
+type testCachedSearch struct {
+	cache map[uint64]uint64
+}
+
+// CountResults prints the requested bound and returns its count from the
+// fixture map, or an error when the fixture has no entry for that bound.
+func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) {
+	fmt.Printf("CountResults(%05x)\n", upperBound)
+	if count, ok := c.cache[upperBound]; ok {
+		return count, nil
+	}
+	return 0, fmt.Errorf("cache not set at %x", upperBound)
+}
+
+// RequestString returns only the formatted range, so the expected requests
+// in the tests can be written as plain range strings.
+func (c testCachedSearch) RequestString(filesize rangeFormatter) string {
+	formatted := filesize.RangeString()
+	return formatted
+}
+
+// TODO(damienr74) make tests easier to write. I'm thinking I can make the test
+// cache take in a list of (filesize, count) pairs and it can populate the cache
+// without relying on how the implementation will create queries. This was only
+// a quick and dirty test to make sure that modifications are not going to break
+// the functionality.
+func TestRangeSplitting(t *testing.T) {
+	// Keys follow the binary search depending on whether or not the range
+	// is too small/large to find close to optimal filesize ranges. This
+	// test is heavily tied to the fact that the search is using powers of two
+	// to make progress in the search (hence the use of hexadecimal values).
+	cache := testCachedSearch{
+		map[uint64]uint64{
+			0x80000: 5000,
+			0x40000: 5000,
+			0x20000: 5000,
+			0x10000: 5000,
+			0x08000: 5000,
+			0x04000: 5000,
+			0x02000: 5000,
+			0x01000: 5000,
+			0x00fff: 3950,
+			0x00ffe: 3950,
+			0x00ffc: 3950,
+			0x00ff8: 3950,
+			0x00ff0: 3950,
+			0x00fe0: 3950,
+			0x00fc0: 3950,
+			0x00f80: 3950,
+			0x00f00: 3950,
+			0x00e00: 3950,
+			0x00c00: 3950,
+			0x00800: 3950,
+			0x00400: 3950,
+			0x00200: 3688,
+			0x00180: 3028,
+			0x00100: 2999,
+			0x000c0: 2448,
+			0x00080: 1999,
+			0x00070: 1600,
+			0x0006c: 1003,
+			0x0006b: 1001,
+			0x0006a: 999,
+			0x00068: 999,
+			0x00060: 999,
+			0x00040: 999,
+			0x00000: 0,
+		},
+	}
+
+	requests, err := FindRangesForRepoSearch(cache)
+	if err != nil {
+		t.Errorf("Error while finding ranges: %v", err)
+	}
+	expected := []string{
+		"<107",      // cache.RequestString(RangeLessThan{0x6b}),
+		"107..128",  // cache.RequestString(RangeWithin{0x6b, 0x80}),
+		"129..256",  // cache.RequestString(RangeWithin{0x81, 0x100}),
+		"257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}),
+		">4095",     // cache.RequestString(RangeGreaterThan{0xfff}),
+	}
+
+	if !reflect.DeepEqual(requests, expected) {
+		t.Errorf("Expected requests (%v) to equal (%v)", requests, expected)
+	}
+}