Crawler performance improvements, better structure

Refactored the crawler implementation to make the whole thing more testable. Added a document interface to make the crawler generic. This will be useful for collecting plugins, and other documents.
2026-06-11 00:52:55 +00:00 · 2019-08-23 18:11:55 -07:00
parent a66808a10d
commit c2d6f09ef3
13 changed files with 1251 additions and 99 deletions
--- a/internal/tools/crawler/crawler.go
+++ b/internal/tools/crawler/crawler.go
@@ -6,11 +6,19 @@ package crawler
 import (
 	"context"
 	"fmt"
+	"log"
+	"os"
 	"sync"

+	_ "github.com/gomodule/redigo/redis"
+
 	"sigs.k8s.io/kustomize/internal/tools/doc"
 )

+var (
+	logger = log.New(os.Stdout, "Crawler: ", log.LstdFlags|log.LUTC|log.Llongfile)
+)
+
 // Crawler forwards documents from source repositories to index and store them
 // for searching. Each crawler is responsible for querying it's source of
 // information, and forwarding files that have not been seen before or that need
@@ -19,7 +27,152 @@ type Crawler interface {
 	// Crawl returns when it is done processing. This method does not take
 	// ownership of the channel. The channel is write only, and it
 	// designates where the crawler should forward the documents.
-	Crawl(ctx context.Context, output chan<- *doc.KustomizationDocument) error
+	Crawl(ctx context.Context, output chan<- CrawlerDocument) error
+
+	// Get the document data given the FilePath, Repo, and Ref/Tag/Branch.
+	FetchDocument(context.Context, *doc.Document) error
+	// Write to the document what the created time is.
+	SetCreated(context.Context, *doc.Document) error
+
+	Match(*doc.Document) bool
+}
+
+type CrawlerDocument interface {
+	ID() string
+	GetDocument() *doc.Document
+	GetResources() ([]*doc.Document, error)
+	WasCached() bool
+}
+
+type CrawlerSeed []*doc.Document
+
+type IndexFunc func(CrawlerDocument, Crawler) error
+type Converter func(*doc.Document) (CrawlerDocument, error)
+
+// Cleaner, more efficient, and more extensible crawler implementation.
+// The seed must include the ids of each document in the index.
+func CrawlFromSeed(ctx context.Context, seed CrawlerSeed,
+	crawlers []Crawler, conv Converter, indx IndexFunc) {
+
+	seen := make(map[string]struct{})
+
+	logIfErr := func(err error) {
+		if err == nil {
+			return
+		}
+		logger.Println("error: ", err)
+	}
+
+	stack := make(CrawlerSeed, 0)
+
+	findMatch := func(d *doc.Document) Crawler {
+		for _, crawl := range crawlers {
+			if crawl.Match(d) {
+				return crawl
+			}
+		}
+
+		return nil
+	}
+
+	addBranches := func(cdoc CrawlerDocument, match Crawler) {
+		if _, ok := seen[cdoc.ID()]; ok {
+			return
+		}
+
+		seen[cdoc.ID()] = struct{}{}
+		// Insert into index
+		err := indx(cdoc, match)
+		logIfErr(err)
+		if err != nil {
+			return
+		}
+
+		deps, err := cdoc.GetResources()
+		logIfErr(err)
+		if err != nil {
+			return
+		}
+		for _, dep := range deps {
+			if _, ok := seen[dep.ID()]; ok {
+				continue
+			}
+			stack = append(stack, dep)
+		}
+	}
+
+	doCrawl := func(docsPtr *CrawlerSeed) {
+		for len(*docsPtr) > 0 {
+			back := len(*docsPtr) - 1
+			next := (*docsPtr)[back]
+			*docsPtr = (*docsPtr)[:back]
+
+			match := findMatch(next)
+			if match == nil {
+				logIfErr(fmt.Errorf(
+					"%v could not match any crawler", next))
+				continue
+			}
+
+			err := match.FetchDocument(ctx, next)
+			logIfErr(err)
+			// If there was no change or there is an error, we don't have
+			// to branch out, since the dependencies are already in the
+			// index, or we cannot find the document.
+			if err != nil || next.WasCached() {
+				continue
+			}
+
+			cdoc, err := conv(next)
+			logIfErr(err)
+			if err != nil {
+				continue
+			}
+
+			addBranches(cdoc, match)
+		}
+	}
+	// Exploit seed to update bulk of corpus.
+	logger.Printf("updating %d documents from seed\n", len(seed))
+	doCrawl(&seed)
+	// Traverse any new links added while updating corpus.
+	logger.Printf("crawling %d new documents found in the seed\n", len(stack))
+	doCrawl(&stack)
+
+	ch := make(chan CrawlerDocument, 1<<10)
+	wg := sync.WaitGroup{}
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		for cdoc := range ch {
+			if _, ok := seen[cdoc.ID()]; ok {
+				continue
+			}
+			match := findMatch(cdoc.GetDocument())
+			if match == nil {
+				logIfErr(fmt.Errorf(
+					"%v could not match any crawler", cdoc))
+				continue
+			}
+			addBranches(cdoc, match)
+		}
+	}()
+
+	// Exploration through APIs.
+	errs := CrawlerRunner(ctx, ch, crawlers)
+	if errs != nil {
+		for _, err := range errs {
+			logIfErr(err)
+		}
+	}
+	close(ch)
+	logger.Println("Processing the new documents from the crawlers' exploration.")
+	wg.Wait()
+	// Handle deps of newly discovered documents.
+	logger.Printf("crawling the %d new documents from the crawlers' exploration.",
+		len(stack))
+	doCrawl(&stack)
 }

 // CrawlerRunner is a blocking function and only returns once all of the
@@ -32,8 +185,15 @@ type Crawler interface {
 // The return value is an array of errors in which each index represents the
 // index of the crawler that emitted the error. Although the errors themselves
 // can be nil, the array will always be exactly the size of the crawlers array.
+//
+// Crawler Runner takes in a seed, which represents the documents stored in an
+// index somewhere. The document data is not required to be populated. If there
+// are many documents, this is preferable. The order of iteration over the seed
+// is not garanteed, but the CrawlerRunner does guarantee that every element
+// from the seed will be processed before any other documents from the
+// crawlers.
 func CrawlerRunner(ctx context.Context,
-	output chan<- *doc.KustomizationDocument, crawlers []Crawler) []error {
+	output chan<- CrawlerDocument, crawlers []Crawler) []error {

 	errs := make([]error, len(crawlers))
 	wg := sync.WaitGroup{}
@@ -41,12 +201,12 @@ func CrawlerRunner(ctx context.Context,
 	for i, crawler := range crawlers {
 		// Crawler implementations get their own channels to prevent a
 		// crawler from closing the main output channel.
-		docs := make(chan *doc.KustomizationDocument)
+		docs := make(chan CrawlerDocument)
 		wg.Add(2)

 		// Forward all of the documents from this crawler's channel to
 		// the main output channel.
-		go func(docs <-chan *doc.KustomizationDocument) {
+		go func(docs <-chan CrawlerDocument) {
 			defer wg.Done()
 			for doc := range docs {
 				output <- doc
@@ -55,7 +215,7 @@ func CrawlerRunner(ctx context.Context,

 		// Run this crawler and capture its returned error.
 		go func(idx int, crawler Crawler,
-			docs chan<- *doc.KustomizationDocument) {
+			docs chan<- CrawlerDocument) {

 			defer func() {
 				wg.Done()
--- a/internal/tools/crawler/crawler_test.go
+++ b/internal/tools/crawler/crawler_test.go
@@ -3,26 +3,88 @@ package crawler
 import (
 	"context"
 	"errors"
+	"fmt"
 	"reflect"
 	"sort"
+	"strings"
 	"sync"
 	"testing"
+	"time"

 	"sigs.k8s.io/kustomize/internal/tools/doc"
+	"sigs.k8s.io/kustomize/v3/pkg/pgmconfig"
+)
+
+const (
+	kustomizeRepo = "https://github.com/kubernetes-sigs/kustomize"
 )

 // Simple crawler that forwards it's list of documents to a provided channel and
 // returns it's error to the caller.
 type testCrawler struct {
-	docs []doc.KustomizationDocument
-	err  error
+	matchPrefix string
+	err         error
+	docs        []doc.KustomizationDocument
+	lukp        map[string]int
+}
+
+func (c testCrawler) Match(d *doc.Document) bool {
+	return d != nil && strings.HasPrefix(d.ID(), c.matchPrefix)
+}
+
+func (c testCrawler) FetchDocument(ctx context.Context, d *doc.Document) error {
+	if i, ok := c.lukp[d.ID()]; ok {
+		d.DocumentData = c.docs[i].DocumentData
+		return nil
+	}
+	for _, suffix := range pgmconfig.KustomizationFileNames {
+		fmt.Println(d.ID(), "/", suffix)
+		i, ok := c.lukp[d.ID()+"/"+suffix]
+		if !ok {
+			continue
+		}
+		d.FilePath += "/" + suffix
+		d.DocumentData = c.docs[i].DocumentData
+		return nil
+	}
+	return fmt.Errorf("Document %v does not exist for matcher: %s",
+		d, c.matchPrefix)
+}
+
+func (c testCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
+	d.CreationTime = &time.Time{}
+	return nil
+}
+
+func newCrawler(matchPrefix string, err error,
+	docs []doc.KustomizationDocument) testCrawler {
+	c := testCrawler{
+		matchPrefix: matchPrefix,
+		err:         err,
+		docs:        docs,
+		lukp:        make(map[string]int),
+	}
+	for i, d := range docs {
+		c.lukp[d.ID()] = i
+	}
+	return c
 }

 // Crawl implements the Crawler interface for testing.
 func (c testCrawler) Crawl(ctx context.Context,
-	output chan<- *doc.KustomizationDocument) error {
+	output chan<- CrawlerDocument) error {

-	for i := range c.docs {
+	for i, d := range c.docs {
+		isResource := true
+		for _, suffix := range pgmconfig.KustomizationFileNames {
+			if strings.HasSuffix(d.FilePath, suffix) {
+				isResource = false
+				break
+			}
+		}
+		if isResource {
+			continue
+		}
 		output <- &c.docs[i]
 	}
 	return c.err
@@ -45,6 +107,7 @@ func (s sortableDocs) Len() int {
 }

 func TestCrawlerRunner(t *testing.T) {
+	fmt.Println("testing CrawlerRunner")
 	tests := []struct {
 		tc   []Crawler
 		errs []error
@@ -54,17 +117,27 @@ func TestCrawlerRunner(t *testing.T) {
 			tc: []Crawler{
 				testCrawler{
 					docs: []doc.KustomizationDocument{
-						{FilePath: "crawler1/doc1"},
-						{FilePath: "crawler1/doc2"},
-						{FilePath: "crawler1/doc3"},
+						{Document: doc.Document{
+							FilePath: "crawler1/doc1/kustomization.yaml",
+						}},
+						{Document: doc.Document{
+							FilePath: "crawler1/doc2/kustomization.yaml",
+						}},
+						{Document: doc.Document{
+							FilePath: "crawler1/doc3/kustomization.yaml",
+						}},
 					},
 				},
 				testCrawler{err: errors.New("crawler2")},
 				testCrawler{},
 				testCrawler{
 					docs: []doc.KustomizationDocument{
-						{FilePath: "crawler4/doc1"},
-						{FilePath: "crawler4/doc2"},
+						{Document: doc.Document{
+							FilePath: "crawler4/doc1/kustomization.yaml",
+						}},
+						{Document: doc.Document{
+							FilePath: "crawler4/doc2/kustomization.yaml",
+						}},
 					},
 					err: errors.New("crawler4"),
 				},
@@ -76,17 +149,27 @@ func TestCrawlerRunner(t *testing.T) {
 				errors.New("crawler4"),
 			},
 			docs: sortableDocs{
-				{FilePath: "crawler1/doc1"},
-				{FilePath: "crawler1/doc2"},
-				{FilePath: "crawler1/doc3"},
-				{FilePath: "crawler4/doc1"},
-				{FilePath: "crawler4/doc2"},
+				{Document: doc.Document{
+					FilePath: "crawler1/doc1/kustomization.yaml",
+				}},
+				{Document: doc.Document{
+					FilePath: "crawler1/doc2/kustomization.yaml",
+				}},
+				{Document: doc.Document{
+					FilePath: "crawler1/doc3/kustomization.yaml",
+				}},
+				{Document: doc.Document{
+					FilePath: "crawler4/doc1/kustomization.yaml",
+				}},
+				{Document: doc.Document{
+					FilePath: "crawler4/doc2/kustomization.yaml",
+				}},
 			},
 		},
 	}

 	for _, test := range tests {
-		output := make(chan *doc.KustomizationDocument)
+		output := make(chan CrawlerDocument)
 		wg := sync.WaitGroup{}
 		wg.Add(1)

@@ -95,8 +178,8 @@ func TestCrawlerRunner(t *testing.T) {
 			defer close(output)
 			defer wg.Done()

-			errs := CrawlerRunner(context.Background(), output,
-				test.tc)
+			errs := CrawlerRunner(context.Background(),
+				output, test.tc)

 			// Check that errors are returned as they should be.
 			if !reflect.DeepEqual(errs, test.errs) {
@@ -108,8 +191,13 @@ func TestCrawlerRunner(t *testing.T) {

 		// Iterate over the output channel of Crawler runner.
 		returned := make(sortableDocs, 0, len(test.docs))
-		for doc := range output {
-			returned = append(returned, *doc)
+		for o := range output {
+			d, ok := o.(*doc.KustomizationDocument)
+			if !ok || d == nil {
+				t.Errorf("%T not expected type (%T)",
+					o, d)
+			}
+			returned = append(returned, *d)
 		}

 		// Check that all documents are received.
@@ -122,3 +210,147 @@ func TestCrawlerRunner(t *testing.T) {
 		wg.Wait()
 	}
 }
+
+func TestCrawlFromSeed(t *testing.T) {
+	fmt.Println("testing CrawlFromSeed")
+
+	tests := []struct {
+		seed    CrawlerSeed
+		matcher string
+		corpus  []doc.KustomizationDocument
+	}{
+		{
+			seed: CrawlerSeed{
+				{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/helloWorld/kustomization.yaml",
+				},
+				{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/kustomization.yaml",
+				},
+			},
+			matcher: kustomizeRepo,
+			corpus: []doc.KustomizationDocument{
+				// Visited from the seed, will be ignored in the crawl.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/helloWorld/kustomization.yaml",
+					DocumentData: `
+resources:
+- deployment.yaml
+`,
+				}},
+				// Also visited from the seed as a relative resource.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/helloWorld/deployment.yaml",
+					DocumentData: `
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: hello
+`,
+				}},
+				// Visited from the seed. Has a remote import.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/kustomization.yaml",
+					DocumentData: `
+resources:
+- https://github.com/kubernetes-sigs/kustomize/examples/other/overlay
+- service.yaml
+`,
+				}},
+				// Imported as a base from the seed.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/overlay/kustomization.yaml",
+					DocumentData: `
+resources:
+- https://github.com/kubernetes-sigs/kustomize/examples/seedcrawl1
+- https://github.com/kubernetes-sigs/kustomize/examples/seedcrawl2
+`,
+				}},
+				// Imported as a resource from the seed.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/service.yaml",
+				}},
+				// Visited from crawling seed.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/seedcrawl1/kustomization.yml",
+				}},
+				// Visited from crawling seed.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/seedcrawl2/kustomization.yaml",
+					DocumentData: `
+resources:
+- ../base
+- job.yaml
+`,
+				}},
+				// Visited from crawling seed.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/base/kustomization.yml",
+				}},
+				// Visited from crawling seed imported as resource.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/seedcrawl2/job.yaml",
+				}},
+				// Visited from the crawler runner.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/base/kustomization.yaml",
+					DocumentData: `
+resources:
+- ../app
+`,
+				}},
+				// Visited from the crawler runner.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/app/kustomization.yaml",
+					DocumentData: `
+resources:
+- resource.yaml
+`,
+				}},
+				// Visited from crawling runner imported as resource.
+				{Document: doc.Document{
+					RepositoryURL: kustomizeRepo,
+					FilePath:      "examples/other/app/resource.yaml",
+				}},
+			},
+		},
+	}
+
+	for _, tc := range tests {
+		cr := newCrawler(tc.matcher, nil, tc.corpus)
+		visited := make(map[string]int)
+		CrawlFromSeed(context.Background(), tc.seed, []Crawler{cr},
+			func(d *doc.Document) (CrawlerDocument, error) {
+				return &doc.KustomizationDocument{
+					Document: *d,
+				}, nil
+			},
+			func(d CrawlerDocument, cr Crawler) error {
+				visited[d.ID()]++
+				return nil
+			},
+		)
+		if lv, lc := len(visited), len(tc.corpus); lv != lc {
+			t.Errorf("error: %d of %d documents visited.", lv, lc)
+			t.Errorf("\nvisited (%v)\nexpected (%v).", visited, cr.lukp)
+		}
+		for id, cnt := range visited {
+			if cnt != 1 {
+				t.Errorf("%s not visited once (%d)", id, cnt)
+			}
+		}
+	}
+}
--- a/internal/tools/crawler/github/crawler.go
+++ b/internal/tools/crawler/github/crawler.go
@@ -16,7 +16,11 @@ import (
 	"strings"
 	"time"

+	"sigs.k8s.io/kustomize/internal/tools/crawler"
 	"sigs.k8s.io/kustomize/internal/tools/doc"
+	"sigs.k8s.io/kustomize/internal/tools/httpclient"
+	"sigs.k8s.io/kustomize/v3/pkg/git"
+	"sigs.k8s.io/kustomize/v3/pkg/pgmconfig"
 )

 var logger = log.New(os.Stdout, "Github Crawler: ",
@@ -34,6 +38,17 @@ type GitHubClient struct {
 	client     *http.Client
 }

+func NewClient(accessToken string, retryCount uint64, client *http.Client) GitHubClient {
+	return GitHubClient{
+		retryCount: retryCount,
+		client:     client,
+		RequestConfig: RequestConfig{
+			perPage:     githubMaxPageSize,
+			accessToken: accessToken,
+		},
+	}
+}
+
 func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
 	query Query) githubCrawler {

@@ -52,7 +67,7 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client,

 // Implements crawler.Crawler.
 func (gc githubCrawler) Crawl(
-	ctx context.Context, output chan<- *doc.KustomizationDocument) error {
+	ctx context.Context, output chan<- crawler.CrawlerDocument) error {

 	noETagClient := GitHubClient{
 		RequestConfig: gc.client.RequestConfig,
@@ -80,13 +95,78 @@ func (gc githubCrawler) Crawl(
 		}
 	}

-	return errs
+	if len(errs) > 0 {
+		return errs
+	}
+
+	return nil
+}
+
+func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) error {
+	repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch
+	repoSpec, err := git.NewRepoSpecFromUrl(repoURL)
+	if err != nil {
+		return fmt.Errorf("invalid repospec: %v", err)
+	}
+
+	url := "https://raw.githubusercontent.com/" + repoSpec.OrgRepo +
+		"/" + repoSpec.Ref + "/" + repoSpec.Path
+
+	handle := func(resp *http.Response, err error, path string) error {
+		if err == nil && resp.StatusCode == http.StatusOK {
+			d.IsSame = httpclient.FromCache(resp.Header)
+			defer resp.Body.Close()
+			data, err := ioutil.ReadAll(resp.Body)
+			if err != nil {
+				return err
+			}
+			d.DocumentData = string(data)
+			d.FilePath = d.FilePath + path
+			return nil
+		}
+		return err
+	}
+	resp, err := gc.client.GetRawUserContent(url)
+	if err := handle(resp, err, ""); err == nil {
+		return nil
+	}
+
+	for _, file := range pgmconfig.KustomizationFileNames {
+		resp, err = gc.client.GetRawUserContent(url + "/" + file)
+		err := handle(resp, err, "/"+file)
+		if err != nil {
+			continue
+		}
+	}
+	return fmt.Errorf("File Not Found: %s", url)
+}
+
+func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
+	fs := GithubFileSpec{}
+	fs.Repository.FullName = d.RepositoryURL + "/" + d.FilePath
+	creationTime, err := gc.client.GetFileCreationTime(fs)
+	if err != nil {
+		return err
+	}
+	d.CreationTime = &creationTime
+	return nil
+}
+
+func (gc githubCrawler) Match(d *doc.Document) bool {
+	url := d.RepositoryURL + "/" + d.FilePath + "?ref=" + "/" +
+		d.DefaultBranch
+	repoSpec, err := git.NewRepoSpecFromUrl(url)
+	if err != nil {
+		return false
+	}
+
+	return strings.Contains(repoSpec.Host, "github.com")
 }

 // processQuery follows all of the pages in a query, and updates/adds the
 // documents from the crawl to the datastore/index.
 func processQuery(ctx context.Context, gcl GitHubClient, query string,
-	output chan<- *doc.KustomizationDocument) error {
+	output chan<- crawler.CrawlerDocument) error {

 	queryPages := make(chan GithubResponseInfo)

@@ -112,13 +192,6 @@ func processQuery(ctx context.Context, gcl GitHubClient, query string,
 		}

 		for _, file := range page.Parsed.Items {
-			// TODO(damienr74) This is where we'd need to
-			// communicate with redis. Currently always doing a full
-			// reindex of the documents. Since the documents are in
-			// sorted order in each bucket, we can short circuit the
-			// search when we find a file that has been seen, or we
-			// can choose to selectively update files.
-
 			k, err := kustomizationResultAdapter(gcl, file)
 			if err != nil {
 				errs = append(errs, err)
@@ -137,23 +210,33 @@ func processQuery(ctx context.Context, gcl GitHubClient, query string,
 }

 func kustomizationResultAdapter(gcl GitHubClient, k GithubFileSpec) (
-	*doc.KustomizationDocument, error) {
+	crawler.CrawlerDocument, error) {

 	data, err := gcl.GetFileData(k)
 	if err != nil {
 		return nil, err
 	}

-	creationTime, err := gcl.GetFileCreationTime(k)
 	if err != nil {
-		logger.Printf("(error: %v) initializing to current time.", err)
+		logger.Printf(
+			"(error: %v) initializing to current time.\n", err)
+	}
+
+	url := gcl.ReposRequest(k.Repository.FullName)
+	defaultBranch, err := gcl.GetDefaultBranch(url)
+	if err != nil {
+		logger.Printf(
+			"(error: %v) setting default_branch to master\n", err)
+		defaultBranch = "master"
 	}

 	doc := doc.KustomizationDocument{
-		DocumentData:  string(data),
-		FilePath:      k.Path,
-		RepositoryURL: k.Repository.URL,
-		CreationTime:  creationTime,
+		Document: doc.Document{
+			DocumentData:  string(data),
+			FilePath:      k.Path,
+			DefaultBranch: defaultBranch,
+			RepositoryURL: k.Repository.URL,
+		},
 	}

 	return &doc, nil
@@ -227,7 +310,34 @@ func (gcl GitHubClient) GetFileData(k GithubFileSpec) ([]byte, error) {
 	}

 	defer resp.Body.Close()
-	return ioutil.ReadAll(resp.Body)
+	data, err = ioutil.ReadAll(resp.Body)
+	return data, err
+}
+
+func (gcl GitHubClient) GetDefaultBranch(url string) (string, error) {
+	resp, err := gcl.GetReposData(url)
+	if err != nil {
+		return "", fmt.Errorf(
+			"'%s' could not get default_branch: %v", url, err)
+	}
+	defer resp.Body.Close()
+	data, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return "", fmt.Errorf(
+			"could not read default_branch: %v", err)
+	}
+
+	type defaultBranch struct {
+		DefaultBranch string `json:"default_branch,omitempty"`
+	}
+	var branch defaultBranch
+	err = json.Unmarshal(data, &branch)
+	if err != nil {
+		return "", fmt.Errorf(
+			"default_branch json malformed: %v", err)
+	}
+
+	return branch.DefaultBranch, nil
 }

 // GetFileCreationTime gets the earliest date of a file.
@@ -301,29 +411,23 @@ func throttleRepoAPI() {
 	<-contentRateTicker.C
 }

-const (
-	accessTokenKeyword = "access_token="
-	perPageKeyword     = "per_page="
-	contentSearchURL   = "https://api.github.com/repos"
-	contentKeyword     = "contents"
-)
-
 type multiError []error

 func (me multiError) Error() string {
 	size := len(me) + 2
 	strs := make([]string, size)
-	strs[0] = "Errors [\n\t"
+	strs[0] = "Errors ["
 	for i, err := range me {
-		strs[i+1] = err.Error()
+		strs[i+1] = "\t" + err.Error()
 	}
-	strs[size-1] = "\n]"
-	return strings.Join(strs, "\n\t")
+	strs[size-1] = "]"
+	return strings.Join(strs, "\n")
 }

 type GithubFileSpec struct {
 	Path       string `json:"path,omitempty"`
 	Repository struct {
+		API      string `json:"url,omitempty"`
 		URL      string `json:"html_url,omitempty"`
 		FullName string `json:"full_name,omitempty"`
 	} `json:"repository,omitempty"`
--- a/internal/tools/crawler/github/queries.go
+++ b/internal/tools/crawler/github/queries.go
@@ -127,6 +127,11 @@ func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string {
 	return rc.makeRequest(uri, Query{}).URL()
 }

+func (rc RequestConfig) ReposRequest(fullRepoName string) string {
+	uri := fmt.Sprintf("repos/%s", fullRepoName)
+	return rc.makeRequest(uri, Query{}).URL()
+}
+
 // CommitsRequest given the repo name, and a filepath returns a formatted query
 // for the Github API to find the commits that affect this file.
 func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
--- a/internal/tools/crawler/github/split_search_ranges.go
+++ b/internal/tools/crawler/github/split_search_ranges.go
@@ -1,5 +1,102 @@
 package github

+// GitHub only returns at most 1000 results per search query,
+// this is problematic if you want to retrieve all the results for a given
+// search query. However, GitHub allows you to specify as much as you want per
+// query to make things more specific. Specifically for files, GitHub allows
+// you to specify their sizes with range queries. This is very convenient
+// since it allows us to split the search into disjoint sets/shards of results
+// from the different file size ranges.
+//
+// Some important factors to consider:
+//
+// -  These queries are rate limited by the API to roughly once query every two
+//    seconds.
+//
+// -  The search space for file sizes is in bytes, from 0B to < 512KiB (this is
+//    a huge search space that cannot be probed linearly in a timely manner if
+//    granularity is to be expected).
+//
+// -  If you have K files there will likely be ~K/1000 sets that you have find
+//    from this search space in order to get all of the results.
+//
+// -  If you have O(K) sets it is unlikely that they are all of the same size,
+//    since (most files are power law distributed). That means that the range
+//    might be significantly smaller for 1000 small files, than it is for
+//    1000 large files.
+//
+// -  This method is a best effort approach. There are some limitations to what
+//    it can and can't do, so please note the following:
+//
+//    +  There may very well be a filesize that has more than 1000 results.
+//       this method cannot help in this case. However, requerying over time
+//       (days/weeks/months) while sorting by last indexed values may be
+//       sufficient to eventually get all of the results.
+//
+//    +  It's possible that the github API returns inconsistent counts. This
+//       is problematic in most cases, since it can cause many issues if the
+//       case is not handled properly. For instance, if you requested the
+//       number of files of an interval from size:0..64 and get that there
+//       are 900 results, you may query at size:0..96 and get that there
+//       are 800 results. To guarantee that this approach completes and does
+//       not get into a query loop over the same intervals, it will retry a few
+//       times and take the largest of the results or the largest previously
+//       queried value from another range (in this case, the implementation
+//       could decide that size:0..96 must have 900) results. This makes the
+//       approach best effort even if there are no single file sizes of over
+//       1000 results.
+//
+//
+// The approach that was taken to solve this problem is the following:
+//
+// 1. Determine the total number of results by querying from the lower bound
+//    to the upper bound (size:0..max). If there are less than 1000 files,
+//    return a single range of values (size:0..max) since all results can be
+//    retrieved.
+//
+// 2. Otherwise, set a target number of files to be 1000.
+//
+// 3. Binary search for the range from 0..r that provides a file count that is
+//    less than or equal to the target. Once this value is found, store the
+//    upper bound of range (r). If r is the same as the previous value, (or 0)
+//    increase r by one (this guarantees progress, but will miss out on some
+//    results).
+//
+// 4. Increase the target by 1000.
+//
+// 5. Repeat steps 3 and 4 until the target is at or exceeds the total number
+//    of files.
+//
+//
+// In general there are other ways to get all of the files from GitHub. In
+// some cases it would be sufficient to just get the files that are being
+// updated/indexed by github periodically to update the corpus, so this
+// complicated approach does not have to be run every time. However, for
+// some searches, there may be too many results on a time interval to do
+// this simple update search limited to only 1000 results.
+//
+// There is also a more sophisticated approach that may yield better
+// performance:
+// -  Perform this search once and create a prior distribution of file sizes.
+//    Each time you want to retrieve the results of the query, scale the
+//    prior of expected ranges to the current number of files. From each
+//    expected range of 1000 files, perform a exponential search to find the
+//    lower bound of the range. This would likely reduce the total number
+//    of queries by a significant amount since it would only have to search
+//    for a small set of values around each likely range boundary.
+//
+// However, actually retrieving the files will be the bottleneck operation
+// since the number of queries to find the ranges will be close to:
+//   log2(maxFileSize) * totalResults / 1000 ~= totalResults / 50
+// whereas the number of queries to actually get all of the search results
+// are close to:
+//   apiCallsPerResult * 10(pages) * 100(resultsPerPage) * totalResults / 1000
+//   = apiCallsPerResult * totalResults.
+//
+// So it could very well take apiCallsPerResult * 50 times longer to acutally
+// fetch the results (assuming the quotas for the API calls are the same as the
+// search API), than it does to perform these range searches.
+
 import (
 	"fmt"
 	"math/bits"
@@ -12,14 +109,20 @@ const (
 	githubMaxResultsPerQuery = uint64(1000)
 )

-// Interface for testing purposes. Not expecting to have multiple
-// implementations.
+// Interface instead of struct for testing purposes.
+// Not expecting to have multiple implementations.
 type cachedSearch interface {
 	CountResults(uint64) (uint64, error)
 	RequestString(filesize rangeFormatter) string
 }

-// Cache uses bit tricks to be more efficient in detecting
+// cachedSearch is a simple data structure that maps the upper bound (r) of a
+// range from 0 to r to the number of files that have between 0 and r files
+// (inclusive). It also guarantees that the counts are monotonically increasing
+// (not strict) as the value for r increases, by looking at the maximal
+// previous file count for the value that precedes r in the cache.
+//
+// It uses a bit trick to be more efficient in detecting
 // inconsistencies in the returned data from the Github API.
 // Therefore, the cache expects a search to always start at 0, and
 // it expects the max file size to be a power of 2. If this is to be changed
@@ -36,11 +139,12 @@ type cachedSearch interface {
 //    problematic). The current cache implementation looks at the
 //    predecessor entry to find out if the current value is monotonic.
 //    This is where the bit trick is used, since each step in the binary
-//    search is adding or ommiting to add a decreasing of 2 to the query value,
-//    we can remove the least significant set bit to find the predecessor in
-//    constant time. Ultimately since the search is rate limited, we could also
-//    easily afford to compute this in linear time by iterating
-//    over cached values.
+//    search is adding or ommiting to add a decreasing power of 2 to the query
+//    value, we can remove the least significant set bit to find the
+//    predecessor in constant time. Ultimately since the search is rate
+//    limited, we could also easily afford to compute this in linear time
+//    by iterating over cached values. So this trick is not crucial to the
+//    cache's performance.
 type githubCachedSearch struct {
 	cache       map[uint64]uint64
 	gcl         GitHubClient
@@ -160,7 +264,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
 	//
 	// My intuition is that this approach is competitive to a perfectly
 	// optimal solution, but I didn't actually take the time to do a
-	// rigurous proof. Intuitively, since files sizes are typically power
+	// rigorous proof. Intuitively, since files sizes are typically power
 	// law distibuted the binary search will be very skewed towards the
 	// smaller file ranges. This means that in practice this approach will
 	// make fewer than (#files/1000)*(log(n) = 19) queries for