mirror of
https://github.com/kubernetes-sigs/kustomize.git
synced 2026-06-11 09:02:53 +00:00
Introduce dummy program to help with API releases.
This commit is contained in:
236
internal/crawl/crawler/crawler.go
Normal file
236
internal/crawl/crawler/crawler.go
Normal file
@@ -0,0 +1,236 @@
|
||||
// Package crawler provides helper methods and defines an interface for launching
|
||||
// source repository crawlers that retrieve files from a source and forward them
|
||||
// to a channel for indexing and retrieval.
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"sync"
|
||||
|
||||
_ "github.com/gomodule/redigo/redis"
|
||||
|
||||
"sigs.k8s.io/kustomize/internal/tools/doc"
|
||||
)
|
||||
|
||||
var (
	// logger writes this package's diagnostics to stdout, prefixed with
	// "Crawler: " and stamped with the UTC date/time and the full file path
	// and line of the logging call.
	logger = log.New(os.Stdout, "Crawler: ", log.LstdFlags|log.LUTC|log.Llongfile)
)
|
||||
|
||||
// Crawler forwards documents from source repositories to index and store them
// for searching. Each crawler is responsible for querying its source of
// information, and forwarding files that have not been seen before or that need
// updating.
type Crawler interface {
	// Crawl returns when it is done processing. This method does not take
	// ownership of the channel. The channel is write only, and it
	// designates where the crawler should forward the documents.
	Crawl(ctx context.Context, output chan<- CrawlerDocument) error

	// FetchDocument gets the document data given the FilePath, Repo, and
	// Ref/Tag/Branch.
	FetchDocument(context.Context, *doc.Document) error
	// SetCreated writes to the document what the created time is.
	SetCreated(context.Context, *doc.Document) error

	// Match reports whether this crawler accepts the given document.
	// CrawlFromSeed hands each document to the first crawler in its list for
	// which Match returns true.
	Match(*doc.Document) bool
}
|
||||
|
||||
// CrawlerDocument is the view of a document that crawlers emit and that
// CrawlFromSeed indexes.
type CrawlerDocument interface {
	// ID returns the key CrawlFromSeed uses to make sure a document is only
	// indexed once.
	ID() string
	// GetDocument returns the underlying document, which is what
	// Crawler.Match is evaluated against.
	GetDocument() *doc.Document
	// GetResources returns the documents referenced by this one; the
	// ones that were not seen yet are queued for crawling by CrawlFromSeed.
	GetResources() ([]*doc.Document, error)
	// WasCached presumably reports whether the document content came from a
	// cache (i.e. is unchanged) — TODO confirm against the implementations.
	WasCached() bool
}
|
||||
|
||||
// CrawlerSeed is the list of documents a crawl starts from.
type CrawlerSeed []*doc.Document

// IndexFunc is called by CrawlFromSeed for every newly seen document, together
// with the crawler that matched it.
type IndexFunc func(CrawlerDocument, Crawler) error

// Converter turns a fetched document into the CrawlerDocument that gets indexed.
type Converter func(*doc.Document) (CrawlerDocument, error)
|
||||
|
||||
// Cleaner, more efficient, and more extensible crawler implementation.
|
||||
// The seed must include the ids of each document in the index.
|
||||
func CrawlFromSeed(ctx context.Context, seed CrawlerSeed,
|
||||
crawlers []Crawler, conv Converter, indx IndexFunc) {
|
||||
|
||||
seen := make(map[string]struct{})
|
||||
|
||||
logIfErr := func(err error) {
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
logger.Println("error: ", err)
|
||||
}
|
||||
|
||||
stack := make(CrawlerSeed, 0)
|
||||
|
||||
findMatch := func(d *doc.Document) Crawler {
|
||||
for _, crawl := range crawlers {
|
||||
if crawl.Match(d) {
|
||||
return crawl
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
addBranches := func(cdoc CrawlerDocument, match Crawler) {
|
||||
if _, ok := seen[cdoc.ID()]; ok {
|
||||
return
|
||||
}
|
||||
|
||||
seen[cdoc.ID()] = struct{}{}
|
||||
// Insert into index
|
||||
err := indx(cdoc, match)
|
||||
logIfErr(err)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
deps, err := cdoc.GetResources()
|
||||
logIfErr(err)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for _, dep := range deps {
|
||||
if _, ok := seen[dep.ID()]; ok {
|
||||
continue
|
||||
}
|
||||
stack = append(stack, dep)
|
||||
}
|
||||
}
|
||||
|
||||
doCrawl := func(docsPtr *CrawlerSeed) {
|
||||
for len(*docsPtr) > 0 {
|
||||
back := len(*docsPtr) - 1
|
||||
next := (*docsPtr)[back]
|
||||
*docsPtr = (*docsPtr)[:back]
|
||||
|
||||
match := findMatch(next)
|
||||
if match == nil {
|
||||
logIfErr(fmt.Errorf(
|
||||
"%v could not match any crawler", next))
|
||||
continue
|
||||
}
|
||||
|
||||
err := match.FetchDocument(ctx, next)
|
||||
logIfErr(err)
|
||||
// If there was no change or there is an error, we don't have
|
||||
// to branch out, since the dependencies are already in the
|
||||
// index, or we cannot find the document.
|
||||
if err != nil || next.WasCached() {
|
||||
continue
|
||||
}
|
||||
|
||||
cdoc, err := conv(next)
|
||||
logIfErr(err)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
addBranches(cdoc, match)
|
||||
}
|
||||
}
|
||||
// Exploit seed to update bulk of corpus.
|
||||
logger.Printf("updating %d documents from seed\n", len(seed))
|
||||
doCrawl(&seed)
|
||||
// Traverse any new links added while updating corpus.
|
||||
logger.Printf("crawling %d new documents found in the seed\n", len(stack))
|
||||
doCrawl(&stack)
|
||||
|
||||
ch := make(chan CrawlerDocument, 1<<10)
|
||||
wg := sync.WaitGroup{}
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for cdoc := range ch {
|
||||
if _, ok := seen[cdoc.ID()]; ok {
|
||||
continue
|
||||
}
|
||||
match := findMatch(cdoc.GetDocument())
|
||||
if match == nil {
|
||||
logIfErr(fmt.Errorf(
|
||||
"%v could not match any crawler", cdoc))
|
||||
continue
|
||||
}
|
||||
addBranches(cdoc, match)
|
||||
}
|
||||
}()
|
||||
|
||||
// Exploration through APIs.
|
||||
errs := CrawlerRunner(ctx, ch, crawlers)
|
||||
if errs != nil {
|
||||
for _, err := range errs {
|
||||
logIfErr(err)
|
||||
}
|
||||
}
|
||||
close(ch)
|
||||
logger.Println("Processing the new documents from the crawlers' exploration.")
|
||||
wg.Wait()
|
||||
// Handle deps of newly discovered documents.
|
||||
logger.Printf("crawling the %d new documents from the crawlers' exploration.",
|
||||
len(stack))
|
||||
doCrawl(&stack)
|
||||
}
|
||||
|
||||
// CrawlerRunner is a blocking function and only returns once all of the
|
||||
// crawlers are finished with execution.
|
||||
//
|
||||
// This function uses the output channel to forward kustomization documents
|
||||
// from a list of crawlers. The output is to be consumed by a database/search
|
||||
// indexer for later retrieval.
|
||||
//
|
||||
// The return value is an array of errors in which each index represents the
|
||||
// index of the crawler that emitted the error. Although the errors themselves
|
||||
// can be nil, the array will always be exactly the size of the crawlers array.
|
||||
//
|
||||
// Crawler Runner takes in a seed, which represents the documents stored in an
|
||||
// index somewhere. The document data is not required to be populated. If there
|
||||
// are many documents, this is preferable. The order of iteration over the seed
|
||||
// is not garanteed, but the CrawlerRunner does guarantee that every element
|
||||
// from the seed will be processed before any other documents from the
|
||||
// crawlers.
|
||||
func CrawlerRunner(ctx context.Context,
|
||||
output chan<- CrawlerDocument, crawlers []Crawler) []error {
|
||||
|
||||
errs := make([]error, len(crawlers))
|
||||
wg := sync.WaitGroup{}
|
||||
|
||||
for i, crawler := range crawlers {
|
||||
// Crawler implementations get their own channels to prevent a
|
||||
// crawler from closing the main output channel.
|
||||
docs := make(chan CrawlerDocument)
|
||||
wg.Add(2)
|
||||
|
||||
// Forward all of the documents from this crawler's channel to
|
||||
// the main output channel.
|
||||
go func(docs <-chan CrawlerDocument) {
|
||||
defer wg.Done()
|
||||
for doc := range docs {
|
||||
output <- doc
|
||||
}
|
||||
}(docs)
|
||||
|
||||
// Run this crawler and capture its returned error.
|
||||
go func(idx int, crawler Crawler,
|
||||
docs chan<- CrawlerDocument) {
|
||||
|
||||
defer func() {
|
||||
wg.Done()
|
||||
if r := recover(); r != nil {
|
||||
errs[idx] = fmt.Errorf(
|
||||
"%+v panicked: %v, additional error %v",
|
||||
crawler, r, errs[idx],
|
||||
)
|
||||
}
|
||||
}()
|
||||
defer close(docs)
|
||||
errs[idx] = crawler.Crawl(ctx, docs)
|
||||
}(i, crawler, docs) // Copies the index and the crawler
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
return errs
|
||||
}
|
||||
356
internal/crawl/crawler/crawler_test.go
Normal file
356
internal/crawl/crawler/crawler_test.go
Normal file
@@ -0,0 +1,356 @@
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"sigs.k8s.io/kustomize/internal/tools/doc"
|
||||
"sigs.k8s.io/kustomize/v3/pkg/pgmconfig"
|
||||
)
|
||||
|
||||
const (
	// kustomizeRepo is the repository URL shared by the seed and the corpus
	// documents in TestCrawlFromSeed; it is also used there as the crawler's
	// match prefix.
	kustomizeRepo = "https://github.com/kubernetes-sigs/kustomize"
)
|
||||
|
||||
// Simple crawler that forwards its list of documents to a provided channel and
// returns its error to the caller.
type testCrawler struct {
	// matchPrefix is the prefix a document ID must have for Match to accept it.
	matchPrefix string
	// err is returned as-is by Crawl.
	err error
	// docs is the corpus this crawler serves.
	docs []doc.KustomizationDocument
	// lukp maps a document ID to its index in docs (filled in by newCrawler).
	lukp map[string]int
}
|
||||
|
||||
func (c testCrawler) Match(d *doc.Document) bool {
|
||||
return d != nil && strings.HasPrefix(d.ID(), c.matchPrefix)
|
||||
}
|
||||
|
||||
func (c testCrawler) FetchDocument(ctx context.Context, d *doc.Document) error {
|
||||
if i, ok := c.lukp[d.ID()]; ok {
|
||||
d.DocumentData = c.docs[i].DocumentData
|
||||
return nil
|
||||
}
|
||||
for _, suffix := range pgmconfig.KustomizationFileNames {
|
||||
fmt.Println(d.ID(), "/", suffix)
|
||||
i, ok := c.lukp[d.ID()+"/"+suffix]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
d.FilePath += "/" + suffix
|
||||
d.DocumentData = c.docs[i].DocumentData
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("Document %v does not exist for matcher: %s",
|
||||
d, c.matchPrefix)
|
||||
}
|
||||
|
||||
func (c testCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
|
||||
d.CreationTime = &time.Time{}
|
||||
return nil
|
||||
}
|
||||
|
||||
func newCrawler(matchPrefix string, err error,
|
||||
docs []doc.KustomizationDocument) testCrawler {
|
||||
c := testCrawler{
|
||||
matchPrefix: matchPrefix,
|
||||
err: err,
|
||||
docs: docs,
|
||||
lukp: make(map[string]int),
|
||||
}
|
||||
for i, d := range docs {
|
||||
c.lukp[d.ID()] = i
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// Crawl implements the Crawler interface for testing.
|
||||
func (c testCrawler) Crawl(ctx context.Context,
|
||||
output chan<- CrawlerDocument) error {
|
||||
|
||||
for i, d := range c.docs {
|
||||
isResource := true
|
||||
for _, suffix := range pgmconfig.KustomizationFileNames {
|
||||
if strings.HasSuffix(d.FilePath, suffix) {
|
||||
isResource = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if isResource {
|
||||
continue
|
||||
}
|
||||
output <- &c.docs[i]
|
||||
}
|
||||
return c.err
|
||||
}
|
||||
|
||||
// Used to make sure that we're comparing documents in order. This is needed
|
||||
// since these documents will be sent concurrently.
|
||||
type sortableDocs []doc.KustomizationDocument
|
||||
|
||||
func (s sortableDocs) Less(i, j int) bool {
|
||||
return s[i].FilePath < s[j].FilePath
|
||||
}
|
||||
|
||||
func (s sortableDocs) Swap(i, j int) {
|
||||
s[i], s[j] = s[j], s[i]
|
||||
}
|
||||
|
||||
func (s sortableDocs) Len() int {
|
||||
return len(s)
|
||||
}
|
||||
|
||||
func TestCrawlerRunner(t *testing.T) {
|
||||
fmt.Println("testing CrawlerRunner")
|
||||
tests := []struct {
|
||||
tc []Crawler
|
||||
errs []error
|
||||
docs sortableDocs
|
||||
}{
|
||||
{
|
||||
tc: []Crawler{
|
||||
testCrawler{
|
||||
docs: []doc.KustomizationDocument{
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc1/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc2/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc3/kustomization.yaml",
|
||||
}},
|
||||
},
|
||||
},
|
||||
testCrawler{err: errors.New("crawler2")},
|
||||
testCrawler{},
|
||||
testCrawler{
|
||||
docs: []doc.KustomizationDocument{
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler4/doc1/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler4/doc2/kustomization.yaml",
|
||||
}},
|
||||
},
|
||||
err: errors.New("crawler4"),
|
||||
},
|
||||
},
|
||||
errs: []error{
|
||||
nil,
|
||||
errors.New("crawler2"),
|
||||
nil,
|
||||
errors.New("crawler4"),
|
||||
},
|
||||
docs: sortableDocs{
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc1/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc2/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc3/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler4/doc1/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler4/doc2/kustomization.yaml",
|
||||
}},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
output := make(chan CrawlerDocument)
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
// Run the Crawler runner with a list of crawlers.
|
||||
go func() {
|
||||
defer close(output)
|
||||
defer wg.Done()
|
||||
|
||||
errs := CrawlerRunner(context.Background(),
|
||||
output, test.tc)
|
||||
|
||||
// Check that errors are returned as they should be.
|
||||
if !reflect.DeepEqual(errs, test.errs) {
|
||||
t.Errorf("Expected errs (%v) to equal (%v)",
|
||||
errs, test.errs)
|
||||
}
|
||||
|
||||
}()
|
||||
|
||||
// Iterate over the output channel of Crawler runner.
|
||||
returned := make(sortableDocs, 0, len(test.docs))
|
||||
for o := range output {
|
||||
d, ok := o.(*doc.KustomizationDocument)
|
||||
if !ok || d == nil {
|
||||
t.Errorf("%T not expected type (%T)",
|
||||
o, d)
|
||||
}
|
||||
returned = append(returned, *d)
|
||||
}
|
||||
|
||||
// Check that all documents are received.
|
||||
sort.Sort(returned)
|
||||
if !reflect.DeepEqual(returned, test.docs) {
|
||||
t.Errorf("Expected docs (%v) to equal (%v)\n",
|
||||
returned, test.docs)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
}
|
||||
|
||||
func TestCrawlFromSeed(t *testing.T) {
|
||||
fmt.Println("testing CrawlFromSeed")
|
||||
|
||||
tests := []struct {
|
||||
seed CrawlerSeed
|
||||
matcher string
|
||||
corpus []doc.KustomizationDocument
|
||||
}{
|
||||
{
|
||||
seed: CrawlerSeed{
|
||||
{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/helloWorld/kustomization.yaml",
|
||||
},
|
||||
{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/kustomization.yaml",
|
||||
},
|
||||
},
|
||||
matcher: kustomizeRepo,
|
||||
corpus: []doc.KustomizationDocument{
|
||||
// Visited from the seed, will be ignored in the crawl.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/helloWorld/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- deployment.yaml
|
||||
`,
|
||||
}},
|
||||
// Also visited from the seed as a relative resource.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/helloWorld/deployment.yaml",
|
||||
DocumentData: `
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: hello
|
||||
`,
|
||||
}},
|
||||
// Visited from the seed. Has a remote import.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- https://github.com/kubernetes-sigs/kustomize/examples/other/overlay
|
||||
- service.yaml
|
||||
`,
|
||||
}},
|
||||
// Imported as a base from the seed.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/overlay/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- https://github.com/kubernetes-sigs/kustomize/examples/seedcrawl1
|
||||
- https://github.com/kubernetes-sigs/kustomize/examples/seedcrawl2
|
||||
`,
|
||||
}},
|
||||
// Imported as a resource from the seed.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/service.yaml",
|
||||
}},
|
||||
// Visited from crawling seed.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/seedcrawl1/kustomization.yml",
|
||||
}},
|
||||
// Visited from crawling seed.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/seedcrawl2/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- ../base
|
||||
- job.yaml
|
||||
`,
|
||||
}},
|
||||
// Visited from crawling seed.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/base/kustomization.yml",
|
||||
}},
|
||||
// Visited from crawling seed imported as resource.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/seedcrawl2/job.yaml",
|
||||
}},
|
||||
// Visited from the crawler runner.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/base/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- ../app
|
||||
`,
|
||||
}},
|
||||
// Visited from the crawler runner.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/app/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- resource.yaml
|
||||
`,
|
||||
}},
|
||||
// Visited from crawling runner imported as resource.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/app/resource.yaml",
|
||||
}},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
cr := newCrawler(tc.matcher, nil, tc.corpus)
|
||||
visited := make(map[string]int)
|
||||
CrawlFromSeed(context.Background(), tc.seed, []Crawler{cr},
|
||||
func(d *doc.Document) (CrawlerDocument, error) {
|
||||
return &doc.KustomizationDocument{
|
||||
Document: *d,
|
||||
}, nil
|
||||
},
|
||||
func(d CrawlerDocument, cr Crawler) error {
|
||||
visited[d.ID()]++
|
||||
return nil
|
||||
},
|
||||
)
|
||||
if lv, lc := len(visited), len(tc.corpus); lv != lc {
|
||||
t.Errorf("error: %d of %d documents visited.", lv, lc)
|
||||
t.Errorf("\nvisited (%v)\nexpected (%v).", visited, cr.lukp)
|
||||
}
|
||||
for id, cnt := range visited {
|
||||
if cnt != 1 {
|
||||
t.Errorf("%s not visited once (%d)", id, cnt)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
582
internal/crawl/crawler/github/crawler.go
Normal file
582
internal/crawl/crawler/github/crawler.go
Normal file
@@ -0,0 +1,582 @@
|
||||
// Package github implements the crawler.Crawler interface, getting data
|
||||
// from the Github search API.
|
||||
package github
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"math"
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"sigs.k8s.io/kustomize/internal/tools/crawler"
|
||||
"sigs.k8s.io/kustomize/internal/tools/doc"
|
||||
"sigs.k8s.io/kustomize/internal/tools/httpclient"
|
||||
"sigs.k8s.io/kustomize/v3/pkg/git"
|
||||
"sigs.k8s.io/kustomize/v3/pkg/pgmconfig"
|
||||
)
|
||||
|
||||
// logger writes this package's diagnostics to stdout, prefixed with
// "Github Crawler: " and stamped with the UTC date/time and the full file
// path and line of the logging call.
var logger = log.New(os.Stdout, "Github Crawler: ",
	log.LstdFlags|log.LUTC|log.Llongfile)
|
||||
|
||||
// githubCrawler implements crawler.Crawler on top of the GitHub APIs.
type githubCrawler struct {
	// client performs the API and raw-content requests.
	client GitHubClient
	// query is the search query that Crawl splits into ranges and processes.
	query Query
}
|
||||
|
||||
// GitHubClient bundles the request configuration and the HTTP client used to
// talk to GitHub.
type GitHubClient struct {
	// RequestConfig is embedded; NewClient fills in its page size and access
	// token.
	RequestConfig
	// retryCount is supplied by the constructor; presumably the number of
	// times a request is retried — TODO confirm where it is consumed.
	retryCount uint64
	// client is the underlying HTTP client.
	client *http.Client
}
|
||||
|
||||
func NewClient(accessToken string, retryCount uint64, client *http.Client) GitHubClient {
|
||||
return GitHubClient{
|
||||
retryCount: retryCount,
|
||||
client: client,
|
||||
RequestConfig: RequestConfig{
|
||||
perPage: githubMaxPageSize,
|
||||
accessToken: accessToken,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
|
||||
query Query) githubCrawler {
|
||||
|
||||
return githubCrawler{
|
||||
client: GitHubClient{
|
||||
retryCount: retryCount,
|
||||
client: client,
|
||||
RequestConfig: RequestConfig{
|
||||
perPage: githubMaxPageSize,
|
||||
accessToken: accessToken,
|
||||
},
|
||||
},
|
||||
query: query,
|
||||
}
|
||||
}
|
||||
|
||||
// Implements crawler.Crawler.
|
||||
func (gc githubCrawler) Crawl(
|
||||
ctx context.Context, output chan<- crawler.CrawlerDocument) error {
|
||||
|
||||
noETagClient := GitHubClient{
|
||||
RequestConfig: gc.client.RequestConfig,
|
||||
client: &http.Client{Timeout: gc.client.client.Timeout},
|
||||
retryCount: gc.client.retryCount,
|
||||
}
|
||||
|
||||
// Since Github returns a max of 1000 results per query, we can use
|
||||
// multiple queries that split the search space into chunks of at most
|
||||
// 1000 files to get all of the data.
|
||||
ranges, err := FindRangesForRepoSearch(newCache(noETagClient, gc.query))
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not split %v into ranges, %v\n",
|
||||
gc.query, err)
|
||||
}
|
||||
|
||||
logger.Println("ranges: ", ranges)
|
||||
|
||||
// Query each range for files.
|
||||
errs := make(multiError, 0)
|
||||
for _, query := range ranges {
|
||||
err := processQuery(ctx, gc.client, query, output)
|
||||
if err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
}
|
||||
|
||||
if len(errs) > 0 {
|
||||
return errs
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) error {
|
||||
repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch
|
||||
repoSpec, err := git.NewRepoSpecFromUrl(repoURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid repospec: %v", err)
|
||||
}
|
||||
|
||||
url := "https://raw.githubusercontent.com/" + repoSpec.OrgRepo +
|
||||
"/" + repoSpec.Ref + "/" + repoSpec.Path
|
||||
|
||||
handle := func(resp *http.Response, err error, path string) error {
|
||||
if err == nil && resp.StatusCode == http.StatusOK {
|
||||
d.IsSame = httpclient.FromCache(resp.Header)
|
||||
defer resp.Body.Close()
|
||||
data, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
d.DocumentData = string(data)
|
||||
d.FilePath = d.FilePath + path
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
resp, err := gc.client.GetRawUserContent(url)
|
||||
if err := handle(resp, err, ""); err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, file := range pgmconfig.KustomizationFileNames {
|
||||
resp, err = gc.client.GetRawUserContent(url + "/" + file)
|
||||
err := handle(resp, err, "/"+file)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("File Not Found: %s", url)
|
||||
}
|
||||
|
||||
func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
|
||||
fs := GithubFileSpec{}
|
||||
fs.Repository.FullName = d.RepositoryURL + "/" + d.FilePath
|
||||
creationTime, err := gc.client.GetFileCreationTime(fs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
d.CreationTime = &creationTime
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gc githubCrawler) Match(d *doc.Document) bool {
|
||||
url := d.RepositoryURL + "/" + d.FilePath + "?ref=" + "/" +
|
||||
d.DefaultBranch
|
||||
repoSpec, err := git.NewRepoSpecFromUrl(url)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return strings.Contains(repoSpec.Host, "github.com")
|
||||
}
|
||||
|
||||
// processQuery follows all of the pages in a query, and updates/adds the
|
||||
// documents from the crawl to the datastore/index.
|
||||
func processQuery(ctx context.Context, gcl GitHubClient, query string,
|
||||
output chan<- crawler.CrawlerDocument) error {
|
||||
|
||||
queryPages := make(chan GithubResponseInfo)
|
||||
|
||||
go func() {
|
||||
// Forward the document metadata to the retrieval channel.
|
||||
// This separation allows for concurrent requests for the code
|
||||
// search, and the retrieval portions of the API.
|
||||
err := gcl.ForwardPaginatedQuery(ctx, query, queryPages)
|
||||
if err != nil {
|
||||
// TODO(damienr74) handle this error with redis?
|
||||
logger.Println(err)
|
||||
}
|
||||
close(queryPages)
|
||||
}()
|
||||
|
||||
errs := make(multiError, 0)
|
||||
errorCnt := 0
|
||||
totalCnt := 0
|
||||
for page := range queryPages {
|
||||
if page.Error != nil {
|
||||
errs = append(errs, page.Error)
|
||||
continue
|
||||
}
|
||||
|
||||
for _, file := range page.Parsed.Items {
|
||||
k, err := kustomizationResultAdapter(gcl, file)
|
||||
if err != nil {
|
||||
errs = append(errs, err)
|
||||
errorCnt++
|
||||
continue
|
||||
}
|
||||
output <- k
|
||||
totalCnt++
|
||||
}
|
||||
|
||||
logger.Printf("got %d files out of %d from API. %d of %d had errors\n",
|
||||
totalCnt, page.Parsed.TotalCount, errorCnt, totalCnt)
|
||||
}
|
||||
|
||||
return errs
|
||||
}
|
||||
|
||||
func kustomizationResultAdapter(gcl GitHubClient, k GithubFileSpec) (
|
||||
crawler.CrawlerDocument, error) {
|
||||
|
||||
data, err := gcl.GetFileData(k)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
logger.Printf(
|
||||
"(error: %v) initializing to current time.\n", err)
|
||||
}
|
||||
|
||||
url := gcl.ReposRequest(k.Repository.FullName)
|
||||
defaultBranch, err := gcl.GetDefaultBranch(url)
|
||||
if err != nil {
|
||||
logger.Printf(
|
||||
"(error: %v) setting default_branch to master\n", err)
|
||||
defaultBranch = "master"
|
||||
}
|
||||
|
||||
doc := doc.KustomizationDocument{
|
||||
Document: doc.Document{
|
||||
DocumentData: string(data),
|
||||
FilePath: k.Path,
|
||||
DefaultBranch: defaultBranch,
|
||||
RepositoryURL: k.Repository.URL,
|
||||
},
|
||||
}
|
||||
|
||||
return &doc, nil
|
||||
}
|
||||
|
||||
// ForwardPaginatedQuery follows the links to the next pages and performs all of
|
||||
// the queries for a given search query, relaying the data from each request
|
||||
// back to an output channel.
|
||||
func (gcl GitHubClient) ForwardPaginatedQuery(ctx context.Context, query string,
|
||||
output chan<- GithubResponseInfo) error {
|
||||
|
||||
logger.Println("querying: ", query)
|
||||
response := gcl.parseGithubResponse(query)
|
||||
|
||||
if response.Error != nil {
|
||||
return response.Error
|
||||
}
|
||||
|
||||
output <- response
|
||||
|
||||
for response.LastURL != "" && response.NextURL != "" {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil
|
||||
default:
|
||||
response = gcl.parseGithubResponse(response.NextURL)
|
||||
if response.Error != nil {
|
||||
return response.Error
|
||||
}
|
||||
|
||||
output <- response
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetFileData gets the bytes from a file.
|
||||
func (gcl GitHubClient) GetFileData(k GithubFileSpec) ([]byte, error) {
|
||||
|
||||
url := gcl.ContentsRequest(k.Repository.FullName, k.Path)
|
||||
|
||||
resp, err := gcl.GetReposData(url)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%+v: could not get '%s' metadata: %v",
|
||||
k, url, err)
|
||||
}
|
||||
|
||||
data, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%+v: could not read '%s' metadata: %v",
|
||||
k, url, err)
|
||||
}
|
||||
resp.Body.Close()
|
||||
|
||||
type githubContentRawURL struct {
|
||||
DownloadURL string `json:"download_url,omitempty"`
|
||||
}
|
||||
var rawURL githubContentRawURL
|
||||
err = json.Unmarshal(data, &rawURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(
|
||||
"%+v: could not get 'download_url' from '%s' response: %v",
|
||||
k, data, err)
|
||||
}
|
||||
|
||||
resp, err = gcl.GetRawUserContent(rawURL.DownloadURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%+v: could not fetch file raw data '%s': %v",
|
||||
k, rawURL.DownloadURL, err)
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
data, err = ioutil.ReadAll(resp.Body)
|
||||
return data, err
|
||||
}
|
||||
|
||||
func (gcl GitHubClient) GetDefaultBranch(url string) (string, error) {
|
||||
resp, err := gcl.GetReposData(url)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf(
|
||||
"'%s' could not get default_branch: %v", url, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
data, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf(
|
||||
"could not read default_branch: %v", err)
|
||||
}
|
||||
|
||||
type defaultBranch struct {
|
||||
DefaultBranch string `json:"default_branch,omitempty"`
|
||||
}
|
||||
var branch defaultBranch
|
||||
err = json.Unmarshal(data, &branch)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf(
|
||||
"default_branch json malformed: %v", err)
|
||||
}
|
||||
|
||||
return branch.DefaultBranch, nil
|
||||
}
|
||||
|
||||
// GetFileCreationTime gets the earliest date of a file.
|
||||
func (gcl GitHubClient) GetFileCreationTime(
|
||||
k GithubFileSpec) (time.Time, error) {
|
||||
|
||||
url := gcl.CommitsRequest(k.Repository.FullName, k.Path)
|
||||
|
||||
defaultTime := time.Now()
|
||||
|
||||
resp, err := gcl.GetReposData(url)
|
||||
if err != nil {
|
||||
return defaultTime, fmt.Errorf(
|
||||
"%+v: '%s' could not get metadata: %v", k, url, err)
|
||||
}
|
||||
|
||||
type DateSpec struct {
|
||||
Commit struct {
|
||||
Author struct {
|
||||
Date string `json:"date,omitempty"`
|
||||
} `json:"author,omitempty"`
|
||||
} `json:"commit,omitempty"`
|
||||
}
|
||||
|
||||
_, lastURL := parseGithubLinkFormat(resp.Header.Get("link"))
|
||||
if lastURL != "" {
|
||||
resp, err = gcl.GetReposData(lastURL)
|
||||
if err != nil {
|
||||
return defaultTime, fmt.Errorf(
|
||||
"%+v: '%s' could not get metadata: %v",
|
||||
k, lastURL, err)
|
||||
}
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
data, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return defaultTime, fmt.Errorf(
|
||||
"%+v: failed to read metadata: %v", k, err)
|
||||
}
|
||||
earliestDate := []DateSpec{}
|
||||
err = json.Unmarshal(data, &earliestDate)
|
||||
size := len(earliestDate)
|
||||
if err != nil || size == 0 {
|
||||
return defaultTime, fmt.Errorf(
|
||||
"%+v: server response '%s' not in expected format: %v",
|
||||
k, data, err)
|
||||
}
|
||||
|
||||
return time.Parse(time.RFC3339, earliestDate[size-1].Commit.Author.Date)
|
||||
}
|
||||
|
||||
// TODO(damienr74) change the tickers to actually check api rate limits, reset
// times, and throttle requests dynamically based off of current utilization,
// instead of hardcoding the documented values, these calls are not quota'd.
// This is now especially important, since caching the API requests will reduce
// API quota use (so we can actually make more requests in the allotted time
// period).
//
// See https://developer.github.com/v3/rate_limit/ for details.
var (
	// searchRateTicker ticks once every two seconds; throttleSearchAPI waits
	// on it.
	searchRateTicker = time.NewTicker(time.Second * 2)
	// contentRateTicker ticks once every second; throttleRepoAPI waits on it.
	contentRateTicker = time.NewTicker(time.Second * 1)
)
|
||||
|
||||
// throttleSearchAPI blocks until the next tick of searchRateTicker.
func throttleSearchAPI() {
	<-searchRateTicker.C
}

// throttleRepoAPI blocks until the next tick of contentRateTicker.
func throttleRepoAPI() {
	<-contentRateTicker.C
}
|
||||
|
||||
// multiError bundles several errors into a single error value.
type multiError []error

// Error renders the collected errors as a bracketed list with one
// tab-indented error message per line.
func (me multiError) Error() string {
	var out strings.Builder
	out.WriteString("Errors [\n")
	for _, err := range me {
		out.WriteString("\t")
		out.WriteString(err.Error())
		out.WriteString("\n")
	}
	out.WriteString("]")
	return out.String()
}
|
||||
|
||||
// GithubFileSpec is the subset of a GitHub code-search result item that this
// package decodes.
type GithubFileSpec struct {
	// Path is decoded from the item's "path" field and becomes the
	// document's FilePath.
	Path string `json:"path,omitempty"`
	// Repository is decoded from the item's "repository" object.
	Repository struct {
		// API is decoded from the repository's "url" field.
		API string `json:"url,omitempty"`
		// URL is decoded from "html_url" and becomes the document's
		// RepositoryURL.
		URL string `json:"html_url,omitempty"`
		// FullName is decoded from "full_name" and is what the repos,
		// contents and commits requests are built from.
		FullName string `json:"full_name,omitempty"`
	} `json:"repository,omitempty"`
}
|
||||
|
||||
// githubResponse is the decoded body of a code-search response.
type githubResponse struct {
	// MaxUint is reserved as a sentinel value.
	// This is the number of files that match the query.
	TotalCount uint64 `json:"total_count,omitempty"`

	// Github representation of a file.
	Items []GithubFileSpec `json:"items,omitempty"`
}
|
||||
|
||||
// GithubResponseInfo carries everything known about one search request: the
// raw HTTP response, its decoded body, and the pagination links.
type GithubResponseInfo struct {
	// Response is the raw HTTP response of the request.
	*http.Response
	// Parsed is the decoded response body; it stays nil when the request or
	// the decoding failed.
	Parsed *githubResponse
	// Error is the first error encountered while performing or decoding the
	// request.
	Error error
	// NextURL and LastURL are the rel="next" and rel="last" targets of the
	// response's "link" header; they are empty when the header has no such
	// entry.
	NextURL string
	LastURL string
}
|
||||
|
||||
// githubLinkInfo matches one entry of a GitHub "Link" header such as
// `<https://api.github.com/...&page=2>; rel="next"`, capturing the URL and
// the relation (only "next" and "last" are of interest). It is compiled once
// at package scope instead of on every call.
var githubLinkInfo = regexp.MustCompile(`<(.*)>.*; rel="(last|next)"`)

// parseGithubLinkFormat extracts the "next" and "last" page URLs from the
// value of a GitHub pagination "Link" header. Entries are comma separated;
// entries with any other relation, or that do not match the expected format,
// are ignored. A URL that is not present is returned as the empty string.
func parseGithubLinkFormat(links string) (string, string) {
	const (
		linkNext = "next"
		linkLast = "last"

		// Indices of the capture groups in a successful match.
		linkInfoURL = 1
		linkInfoRel = 2
	)

	next, last := "", ""
	for _, link := range strings.Split(links, ",") {
		linkParse := githubLinkInfo.FindStringSubmatch(link)
		if len(linkParse) != 3 {
			continue
		}

		target := linkParse[linkInfoURL]
		switch linkParse[linkInfoRel] {
		case linkNext:
			next = target
		case linkLast:
			last = target
		}
	}

	return next, last
}
|
||||
|
||||
func (gcl GitHubClient) parseGithubResponse(getRequest string) GithubResponseInfo {
|
||||
resp, err := gcl.SearchGithubAPI(getRequest)
|
||||
requestInfo := GithubResponseInfo{
|
||||
Response: resp,
|
||||
Error: err,
|
||||
Parsed: nil,
|
||||
}
|
||||
|
||||
if err != nil || resp == nil {
|
||||
return requestInfo
|
||||
}
|
||||
|
||||
var data []byte
|
||||
defer resp.Body.Close()
|
||||
data, requestInfo.Error = ioutil.ReadAll(resp.Body)
|
||||
if requestInfo.Error != nil {
|
||||
return requestInfo
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
logger.Println("query: ", getRequest)
|
||||
logger.Println("status not OK at the source")
|
||||
logger.Println("header dump", resp.Header)
|
||||
logger.Println("body dump", string(data))
|
||||
requestInfo.Error = fmt.Errorf("request rejected, status '%s'",
|
||||
resp.Status)
|
||||
return requestInfo
|
||||
}
|
||||
|
||||
requestInfo.NextURL, requestInfo.LastURL =
|
||||
parseGithubLinkFormat(resp.Header.Get("link"))
|
||||
|
||||
resultCount := githubResponse{
|
||||
TotalCount: math.MaxUint64,
|
||||
}
|
||||
requestInfo.Error = json.Unmarshal(data, &resultCount)
|
||||
if requestInfo.Error != nil {
|
||||
return requestInfo
|
||||
}
|
||||
|
||||
requestInfo.Parsed = &resultCount
|
||||
|
||||
return requestInfo
|
||||
|
||||
}
|
||||
|
||||
// SearchGithubAPI performs a search query and handles rate limiting for
// the 'code/search?' endpoint as well as timed retries in the case of abuse
// prevention. It blocks on the search throttle before issuing the request
// through getWithRetry.
func (gcl GitHubClient) SearchGithubAPI(query string) (*http.Response, error) {
	throttleSearchAPI()
	return gcl.getWithRetry(query)
}
|
||||
|
||||
// GetReposData performs a request and handles rate limiting for
// the '/repos' endpoint as well as timed retries in the case of abuse
// prevention. It blocks on the repo throttle before issuing the request
// through getWithRetry.
func (gcl GitHubClient) GetReposData(query string) (*http.Response, error) {
	throttleRepoAPI()
	return gcl.getWithRetry(query)
}
|
||||
|
||||
// GetRawUserContent fetches query through getWithRetry without throttling.
// User content (file contents) is not API rate limited, so there's no use in
// throttling this call.
func (gcl GitHubClient) GetRawUserContent(query string) (*http.Response, error) {
	return gcl.getWithRetry(query)
}
|
||||
|
||||
func (gcl GitHubClient) getWithRetry(
|
||||
query string) (resp *http.Response, err error) {
|
||||
|
||||
resp, err = gcl.client.Get(query)
|
||||
retryCount := gcl.retryCount
|
||||
|
||||
for err == nil &&
|
||||
resp.StatusCode == http.StatusForbidden &&
|
||||
retryCount > 0 {
|
||||
|
||||
retryTime := resp.Header.Get("Retry-After")
|
||||
i, err := strconv.Atoi(retryTime)
|
||||
if err != nil {
|
||||
return resp, fmt.Errorf(
|
||||
"query '%s' forbidden without 'Retry-After'", query)
|
||||
}
|
||||
logger.Printf(
|
||||
"status forbidden, retring %d more times\n", retryCount)
|
||||
|
||||
logger.Printf("waiting %d seconds before retrying\n", i)
|
||||
time.Sleep(time.Second * time.Duration(i))
|
||||
retryCount--
|
||||
resp, err = gcl.client.Get(query)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return resp, fmt.Errorf("query '%s' could not be processed, %v",
|
||||
query, err)
|
||||
}
|
||||
|
||||
return resp, err
|
||||
}
|
||||
224
internal/crawl/crawler/github/queries.go
Normal file
224
internal/crawl/crawler/github/queries.go
Normal file
@@ -0,0 +1,224 @@
|
||||
package github
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/url"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
	// perPageArg is the name of the URL parameter carrying the page size.
	perPageArg = "per_page"
	// accessTokenArg is the name of the URL parameter carrying the token.
	// NOTE(review): GitHub has deprecated passing tokens as query
	// parameters in favour of the Authorization header — confirm.
	accessTokenArg = "access_token"

	// githubMaxPageSize is presumably the largest per_page value accepted
	// by the GitHub API — TODO confirm.
	githubMaxPageSize = 100
)
|
||||
|
||||
// Implementation detail, not important to external API.
|
||||
type queryField struct {
|
||||
name string
|
||||
value interface{}
|
||||
}
|
||||
|
||||
// Formats a query field.
|
||||
func (qf queryField) String() string {
|
||||
var value string
|
||||
switch v := qf.value.(type) {
|
||||
case string:
|
||||
value = v
|
||||
case rangeFormatter:
|
||||
value = v.RangeString()
|
||||
default:
|
||||
value = fmt.Sprint(v)
|
||||
}
|
||||
|
||||
if qf.name == "" {
|
||||
return value
|
||||
}
|
||||
return fmt.Sprint(qf.name, ":", value)
|
||||
}
|
||||
|
||||
// Example of formating a query:
|
||||
// QueryWith(
|
||||
// Filename("kustomization.yaml"),
|
||||
// Filesize(RangeWithin{64, 192}),
|
||||
// Keyword("copyright"),
|
||||
// Keyword("2019"),
|
||||
// ).String()
|
||||
//
|
||||
// Outputs "q=filename:kustomization.yaml+size:64..192+copyright+2018" which
|
||||
// would search for files that have [64, 192] bytes (inclusive range) and that
|
||||
// contain the keywords 'copyright' and '2019' somewhere in the file.
|
||||
type Query []queryField
|
||||
|
||||
func QueryWith(qfs ...queryField) Query {
|
||||
return Query(qfs)
|
||||
}
|
||||
|
||||
func (q Query) String() string {
|
||||
strs := make([]string, 0, len(q))
|
||||
for _, elem := range q {
|
||||
str := elem.String()
|
||||
if str == "" {
|
||||
continue
|
||||
}
|
||||
strs = append(strs, str)
|
||||
}
|
||||
|
||||
query := strings.Join(strs, "+")
|
||||
if query == "" {
|
||||
return query
|
||||
}
|
||||
return "q=" + query
|
||||
}
|
||||
|
||||
// Keyword takes a single word, and formats it according to the Github API.
|
||||
func Keyword(k string) queryField {
|
||||
return queryField{value: k}
|
||||
}
|
||||
|
||||
// Filesize takes a rangeFormatter and formats it according to the Github API.
|
||||
func Filesize(r rangeFormatter) queryField {
|
||||
return queryField{name: "size", value: r}
|
||||
}
|
||||
|
||||
// Filename takes a filename and formats it according to the Github API.
|
||||
func Filename(f string) queryField {
|
||||
return queryField{name: "filename", value: f}
|
||||
}
|
||||
|
||||
// Path takes a filepath and formats it according to the Github API.
|
||||
func Path(p string) queryField {
|
||||
return queryField{name: "path", value: p}
|
||||
}
|
||||
|
||||
// RequestConfig stores common variables that must be present for the queries.
//   - CodeSearchRequests: ask Github to check the code indices given a query.
//   - ContentsRequests: ask Github where to download a resource given a repo
//     and a file path.
//   - CommitsRequests: asks Github to list commits made on a file. Useful to
//     determine the date of a file.
type RequestConfig struct {
	// perPage is sent as the page size parameter of every request.
	perPage uint64
	// accessToken is sent with every request unless it is empty.
	accessToken string
}

// NewRequestConfig returns a RequestConfig with the given page size and
// access token.
func NewRequestConfig(perPage uint64, accessToken string) RequestConfig {
	var cfg RequestConfig
	cfg.perPage = perPage
	cfg.accessToken = accessToken
	return cfg
}
|
||||
|
||||
// CodeSearchRequestWith given a list of query parameters that specify the
|
||||
// (patial) query, returns a request object with the (parital) query. Must call
|
||||
// the URL method to get the string value of the URL. See request.CopyWith, to
|
||||
// understand why the request object is useful.
|
||||
func (rc RequestConfig) CodeSearchRequestWith(query Query) request {
|
||||
req := rc.makeRequest("search/code", query)
|
||||
req.vals.Set("sort", "indexed")
|
||||
req.vals.Set("order", "desc")
|
||||
return req
|
||||
}
|
||||
|
||||
// ContentsRequest given the repo name, and the filepath returns a formatted
|
||||
// query for the Github API to find the dowload information of this filepath.
|
||||
func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string {
|
||||
uri := fmt.Sprintf("repos/%s/contents/%s", fullRepoName, path)
|
||||
return rc.makeRequest(uri, Query{}).URL()
|
||||
}
|
||||
|
||||
func (rc RequestConfig) ReposRequest(fullRepoName string) string {
|
||||
uri := fmt.Sprintf("repos/%s", fullRepoName)
|
||||
return rc.makeRequest(uri, Query{}).URL()
|
||||
}
|
||||
|
||||
// CommitsRequest given the repo name, and a filepath returns a formatted query
|
||||
// for the Github API to find the commits that affect this file.
|
||||
func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
|
||||
uri := fmt.Sprintf("repos/%s/commits", fullRepoName)
|
||||
return rc.makeRequest(uri, Query{Path(path)}).URL()
|
||||
}
|
||||
|
||||
func (rc RequestConfig) makeRequest(path string, query Query) request {
|
||||
vals := url.Values{}
|
||||
if rc.accessToken != "" {
|
||||
vals.Set(accessTokenArg, rc.accessToken)
|
||||
}
|
||||
vals.Set(perPageArg, fmt.Sprint(rc.perPage))
|
||||
|
||||
return request{
|
||||
url: url.URL{
|
||||
Scheme: "https",
|
||||
Host: "api.github.com",
|
||||
Path: path,
|
||||
},
|
||||
vals: vals,
|
||||
query: query,
|
||||
}
|
||||
}
|
||||
|
||||
type request struct {
|
||||
url url.URL
|
||||
vals url.Values
|
||||
query Query
|
||||
}
|
||||
|
||||
// CopyWith copies the requests and adds the extra query parameters. Usefull
|
||||
// for dynamically adding sizes to a filename only query without modifying it.
|
||||
func (r request) CopyWith(queryParams ...queryField) request {
|
||||
cpy := r
|
||||
cpy.query = append(cpy.query, queryParams...)
|
||||
return cpy
|
||||
}
|
||||
|
||||
// URL encodes the variables and the URL representation into a string.
|
||||
func (r request) URL() string {
|
||||
// Github does not handle URL encoding properly in its API for the
|
||||
// q='...', so the query parameter is added without any encoding
|
||||
// manually.
|
||||
encoded := r.vals.Encode()
|
||||
query := r.query.String()
|
||||
sep := "&"
|
||||
if query == "" {
|
||||
sep = ""
|
||||
}
|
||||
if encoded == "" && query != "" {
|
||||
sep = "?"
|
||||
}
|
||||
r.url.RawQuery = encoded + sep + query
|
||||
return r.url.String()
|
||||
}
|
||||
|
||||
// rangeFormatter allows defining a range of numbers and printing it in the
// github range query format, see
// https://help.github.com/en/articles/understanding-the-search-syntax.
type rangeFormatter interface {
	// RangeString returns the range in github search syntax.
	RangeString() string
}
|
||||
|
||||
// RangeLessThan is a range of values strictly less than (<) size.
type RangeLessThan struct {
	size uint64
}

// RangeString formats the range as "<size".
func (r RangeLessThan) RangeString() string {
	return fmt.Sprint("<", r.size)
}
|
||||
|
||||
// RangeGreaterThan is a range of values strictly greater than (>) size.
type RangeGreaterThan struct {
	size uint64
}

// RangeString formats the range as ">size".
func (r RangeGreaterThan) RangeString() string {
	return fmt.Sprint(">", r.size)
}
|
||||
|
||||
// RangeWithin is an inclusive range from start to end.
type RangeWithin struct {
	start, end uint64
}

// RangeString formats the range as "start..end".
func (r RangeWithin) RangeString() string {
	return fmt.Sprint(r.start, "..", r.end)
}
|
||||
119
internal/crawl/crawler/github/queries_test.go
Normal file
119
internal/crawl/crawler/github/queries_test.go
Normal file
@@ -0,0 +1,119 @@
|
||||
package github
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestQueryFields(t *testing.T) {
|
||||
testCases := []struct {
|
||||
formatter queryField
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
formatter: Keyword("keyword"),
|
||||
expected: "keyword",
|
||||
},
|
||||
{
|
||||
formatter: Filesize(RangeLessThan{23}),
|
||||
expected: "size:<23",
|
||||
},
|
||||
{
|
||||
formatter: Filesize(RangeWithin{24, 64}),
|
||||
expected: "size:24..64",
|
||||
},
|
||||
{
|
||||
formatter: Filesize(RangeGreaterThan{64}),
|
||||
expected: "size:>64",
|
||||
},
|
||||
{
|
||||
formatter: Path("some/path/to/file"),
|
||||
expected: "path:some/path/to/file",
|
||||
},
|
||||
{
|
||||
formatter: Filename("kustomization.yaml"),
|
||||
expected: "filename:kustomization.yaml",
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range testCases {
|
||||
if result := test.formatter.String(); result != test.expected {
|
||||
t.Errorf("got (%#v = %s), expected %s", test.formatter, result, test.expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestQueryType(t *testing.T) {
|
||||
testCases := []struct {
|
||||
query Query
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
query: QueryWith(
|
||||
Filesize(RangeWithin{24, 64}),
|
||||
Filename("kustomization.yaml"),
|
||||
Keyword("keyword1"),
|
||||
Keyword("keyword2"),
|
||||
),
|
||||
expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2",
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range testCases {
|
||||
if queryStr := test.query.String(); queryStr != test.expected {
|
||||
t.Errorf("got (%#v = %s), expected %s", test.query, queryStr, test.expected)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func TestGithubSearchQuery(t *testing.T) {
|
||||
const (
|
||||
accessToken = "random_token"
|
||||
perPage = 100
|
||||
)
|
||||
|
||||
testCases := []struct {
|
||||
rc RequestConfig
|
||||
codeQuery Query
|
||||
fullRepoName string
|
||||
path string
|
||||
expectedCodeQuery string
|
||||
expectedContentsQuery string
|
||||
expectedCommitsQuery string
|
||||
}{
|
||||
{
|
||||
rc: RequestConfig{
|
||||
perPage: perPage,
|
||||
accessToken: accessToken,
|
||||
},
|
||||
codeQuery: Query{
|
||||
Filename("kustomization.yaml"),
|
||||
Filesize(RangeWithin{64, 128}),
|
||||
},
|
||||
fullRepoName: "kubernetes-sigs/kustomize",
|
||||
path: "examples/helloWorld/kustomization.yaml",
|
||||
|
||||
expectedCodeQuery: "https://api.github.com/search/code?" +
|
||||
"access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128",
|
||||
|
||||
expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" +
|
||||
"examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100",
|
||||
|
||||
expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" +
|
||||
"access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml",
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range testCases {
|
||||
if result := test.rc.CodeSearchRequestWith(test.codeQuery).URL(); result != test.expectedCodeQuery {
|
||||
t.Errorf("Got code query: %s, expected %s", result, test.expectedCodeQuery)
|
||||
}
|
||||
|
||||
if result := test.rc.ContentsRequest(test.fullRepoName, test.path); result != test.expectedContentsQuery {
|
||||
t.Errorf("Got contents query: %s, expected %s", result, test.expectedContentsQuery)
|
||||
}
|
||||
if result := test.rc.CommitsRequest(test.fullRepoName, test.path); result != test.expectedCommitsQuery {
|
||||
t.Errorf("Got commits query: %s, expected %s", result, test.expectedCommitsQuery)
|
||||
}
|
||||
}
|
||||
}
|
||||
378
internal/crawl/crawler/github/split_search_ranges.go
Normal file
378
internal/crawl/crawler/github/split_search_ranges.go
Normal file
@@ -0,0 +1,378 @@
|
||||
package github
|
||||
|
||||
// GitHub only returns at most 1000 results per search query,
|
||||
// this is problematic if you want to retrieve all the results for a given
|
||||
// search query. However, GitHub allows you to specify as much as you want per
|
||||
// query to make things more specific. Specifically for files, GitHub allows
|
||||
// you to specify their sizes with range queries. This is very convenient
|
||||
// since it allows us to split the search into disjoint sets/shards of results
|
||||
// from the different file size ranges.
|
||||
//
|
||||
// Some important factors to consider:
|
||||
//
|
||||
// - These queries are rate limited by the API to roughly one query every two
|
||||
// seconds.
|
||||
//
|
||||
// - The search space for file sizes is in bytes, from 0B to < 512KiB (this is
|
||||
// a huge search space that cannot be probed linearly in a timely manner if
|
||||
// granularity is to be expected).
|
||||
//
|
||||
// - If you have K files there will likely be ~K/1000 sets that you have to find
|
||||
// from this search space in order to get all of the results.
|
||||
//
|
||||
// - If you have O(K) sets it is unlikely that they are all of the same size,
|
||||
// since (most files are power law distributed). That means that the range
|
||||
// might be significantly smaller for 1000 small files, than it is for
|
||||
// 1000 large files.
|
||||
//
|
||||
// - This method is a best effort approach. There are some limitations to what
|
||||
// it can and can't do, so please note the following:
|
||||
//
|
||||
// + There may very well be a filesize that has more than 1000 results.
|
||||
// this method cannot help in this case. However, requerying over time
|
||||
// (days/weeks/months) while sorting by last indexed values may be
|
||||
// sufficient to eventually get all of the results.
|
||||
//
|
||||
// + It's possible that the github API returns inconsistent counts. This
|
||||
// is problematic in most cases, since it can cause many issues if the
|
||||
// case is not handled properly. For instance, if you requested the
|
||||
// number of files of an interval from size:0..64 and get that there
|
||||
// are 900 results, you may query at size:0..96 and get that there
|
||||
// are 800 results. To guarantee that this approach completes and does
|
||||
// not get into a query loop over the same intervals, it will retry a few
|
||||
// times and take the largest of the results or the largest previously
|
||||
// queried value from another range (in this case, the implementation
|
||||
// could decide that size:0..96 must have 900) results. This makes the
|
||||
// approach best effort even if there are no single file sizes of over
|
||||
// 1000 results.
|
||||
//
|
||||
//
|
||||
// The approach that was taken to solve this problem is the following:
|
||||
//
|
||||
// 1. Determine the total number of results by querying from the lower bound
|
||||
// to the upper bound (size:0..max). If there are less than 1000 files,
|
||||
// return a single range of values (size:0..max) since all results can be
|
||||
// retrieved.
|
||||
//
|
||||
// 2. Otherwise, set a target number of files to be 1000.
|
||||
//
|
||||
// 3. Binary search for the range from 0..r that provides a file count that is
|
||||
// less than or equal to the target. Once this value is found, store the
|
||||
// upper bound of range (r). If r is the same as the previous value, (or 0)
|
||||
// increase r by one (this guarantees progress, but will miss out on some
|
||||
// results).
|
||||
//
|
||||
// 4. Increase the target by 1000.
|
||||
//
|
||||
// 5. Repeat steps 3 and 4 until the target is at or exceeds the total number
|
||||
// of files.
|
||||
//
|
||||
//
|
||||
// In general there are other ways to get all of the files from GitHub. In
|
||||
// some cases it would be sufficient to just get the files that are being
|
||||
// updated/indexed by github periodically to update the corpus, so this
|
||||
// complicated approach does not have to be run every time. However, for
|
||||
// some searches, there may be too many results on a time interval to do
|
||||
// this simple update search limited to only 1000 results.
|
||||
//
|
||||
// There is also a more sophisticated approach that may yield better
|
||||
// performance:
|
||||
// - Perform this search once and create a prior distribution of file sizes.
|
||||
// Each time you want to retrieve the results of the query, scale the
|
||||
// prior of expected ranges to the current number of files. From each
|
||||
// expected range of 1000 files, perform an exponential search to find the
|
||||
// lower bound of the range. This would likely reduce the total number
|
||||
// of queries by a significant amount since it would only have to search
|
||||
// for a small set of values around each likely range boundary.
|
||||
//
|
||||
// However, actually retrieving the files will be the bottleneck operation
|
||||
// since the number of queries to find the ranges will be close to:
|
||||
// log2(maxFileSize) * totalResults / 1000 ~= totalResults / 50
|
||||
// whereas the number of queries to actually get all of the search results
|
||||
// are close to:
|
||||
// apiCallsPerResult * 10(pages) * 100(resultsPerPage) * totalResults / 1000
|
||||
// = apiCallsPerResult * totalResults.
|
||||
//
|
||||
// So it could very well take apiCallsPerResult * 50 times longer to actually
|
||||
// fetch the results (assuming the quotas for the API calls are the same as the
|
||||
// search API), than it does to perform these range searches.
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math/bits"
|
||||
)
|
||||
|
||||
// Files cannot be more than 2^19 bytes, according to
// https://help.github.com/en/articles/searching-code#considerations-for-code-search
const (
	// githubMaxFileSize is the upper bound (in bytes) of the file size
	// search space; it is a power of two.
	githubMaxFileSize = uint64(1 << 19)
	// githubMaxResultsPerQuery is the number of results a single search
	// query can return.
	githubMaxResultsPerQuery = uint64(1000)
)
|
||||
|
||||
// cachedSearch abstracts the result-count lookups and request formatting
// needed by the range search.
// Interface instead of struct for testing purposes.
// Not expecting to have multiple implementations.
type cachedSearch interface {
	// CountResults reports the number of results for the file size range
	// from 0 to the given upper bound.
	CountResults(uint64) (uint64, error)
	// RequestString returns the request string restricted to the given
	// file size range.
	RequestString(filesize rangeFormatter) string
}
|
||||
|
||||
// cachedSearch is a simple data structure that maps the upper bound (r) of a
|
||||
// range from 0 to r to the number of files that have between 0 and r bytes
|
||||
// (inclusive). It also guarantees that the counts are monotonically increasing
|
||||
// (not strict) as the value for r increases, by looking at the maximal
|
||||
// previous file count for the value that precedes r in the cache.
|
||||
//
|
||||
// It uses a bit trick to be more efficient in detecting
|
||||
// inconsistencies in the returned data from the Github API.
|
||||
// Therefore, the cache expects a search to always start at 0, and
|
||||
// it expects the max file size to be a power of 2. If this is to be changed
|
||||
// there are a few considerations to keep in mind:
|
||||
//
|
||||
// 1. The cache is only efficient if the queries can be reused, so if
|
||||
// the first chunk of files lives in the range 0..x, continuing the
|
||||
// search for the next chunk from x+1..max (while asymptotically sane)
|
||||
// may actually be less efficient since the cache is essentially reset
|
||||
// at every interval. This leads to a larger number of requests in
|
||||
// practice, and requests are what's expensive (rate limits).
|
||||
//
|
||||
// 2. The github API is not perfectly monotonic.. (this is somewhat
|
||||
// problematic). The current cache implementation looks at the
|
||||
// predecessor entry to find out if the current value is monotonic.
|
||||
// This is where the bit trick is used, since each step in the binary
|
||||
// search either adds or omits a decreasing power of 2 in the query
|
||||
// value, we can remove the least significant set bit to find the
|
||||
// predecessor in constant time. Ultimately since the search is rate
|
||||
// limited, we could also easily afford to compute this in linear time
|
||||
// by iterating over cached values. So this trick is not crucial to the
|
||||
// cache's performance.
|
||||
type githubCachedSearch struct {
|
||||
cache map[uint64]uint64
|
||||
gcl GitHubClient
|
||||
baseRequest request
|
||||
}
|
||||
|
||||
func newCache(client GitHubClient, query Query) githubCachedSearch {
|
||||
return githubCachedSearch{
|
||||
cache: map[uint64]uint64{
|
||||
0: 0,
|
||||
},
|
||||
gcl: client,
|
||||
baseRequest: client.CodeSearchRequestWith(query),
|
||||
}
|
||||
}
|
||||
|
||||
func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) {
|
||||
count, cached := c.cache[upperBound]
|
||||
if cached {
|
||||
return count, nil
|
||||
}
|
||||
|
||||
sizeRange := RangeWithin{0, upperBound}
|
||||
rangeRequest := c.RequestString(sizeRange)
|
||||
|
||||
result := c.gcl.parseGithubResponse(rangeRequest)
|
||||
if result.Error != nil {
|
||||
return count, result.Error
|
||||
}
|
||||
|
||||
// As range search uses powers of 2 for binary search, the previously
|
||||
// cached value is easy to find by removing the least significant set
|
||||
// bit from the current upperBound, since each step of the search adds
|
||||
// least significant set bit.
|
||||
//
|
||||
// Finding the predecessor could also be implemented by iterating over
|
||||
// the map to find the largest key that is smaller than upperBound if
|
||||
// this approach deemed too complex.
|
||||
trail := bits.TrailingZeros64(upperBound)
|
||||
prev := uint64(0)
|
||||
if trail != 64 {
|
||||
prev = upperBound - (1 << uint64(trail))
|
||||
}
|
||||
|
||||
// Sometimes the github API is not monotonically increasing, or ouputs
|
||||
// an erroneous value of 0, or 1. This logic makes sure that it was not
|
||||
// erroneous, and that the sequence continues to be monotonic by setting
|
||||
// the current query count to match the previous value. which at least
|
||||
// guarantees that the range search terminates.
|
||||
//
|
||||
// On the other hand, if files are added, then we way loose out on some
|
||||
// files in a reviously completed range, but these files should be there
|
||||
// the next time the crawler runs, so this is not really problematic.
|
||||
retryMonotonicCount := 4
|
||||
for result.Parsed.TotalCount < c.cache[prev] {
|
||||
logger.Printf(
|
||||
"Retrying query... current lower bound: %d, got: %d\n",
|
||||
c.cache[prev], result.Parsed.TotalCount)
|
||||
|
||||
result = c.gcl.parseGithubResponse(rangeRequest)
|
||||
if result.Error != nil {
|
||||
return count, result.Error
|
||||
}
|
||||
|
||||
retryMonotonicCount--
|
||||
if retryMonotonicCount <= 0 {
|
||||
result.Parsed.TotalCount = c.cache[prev]
|
||||
logger.Println(
|
||||
"Retries for monotonic check exceeded,",
|
||||
" setting value to match predecessor")
|
||||
}
|
||||
}
|
||||
|
||||
count = result.Parsed.TotalCount
|
||||
logger.Printf("Caching new query %s, with count %d\n",
|
||||
sizeRange.RangeString(), count)
|
||||
c.cache[upperBound] = count
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func (c githubCachedSearch) RequestString(filesize rangeFormatter) string {
|
||||
return c.baseRequest.CopyWith(Filesize(filesize)).URL()
|
||||
}
|
||||
|
||||
// Outputs a (possibly incomplete) list of ranges to query to find most search
|
||||
// results as permissible by the search github search API. Github search only
|
||||
// allows 1,000 results per query (paginated).
|
||||
// Source: https://developer.github.com/v3/search/
|
||||
//
|
||||
// This leaves the possibility of having file sizes with more than 1000 results,
|
||||
// This would mean that the search as it is could not find all files. If queries
|
||||
// are sorted by last indexed, and retrieved on regular intervals, it should be
|
||||
// sufficient to get most if not all documents.
|
||||
func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
|
||||
totalFiles, err := cache.CountResults(githubMaxFileSize)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
logger.Println("total files: ", totalFiles)
|
||||
|
||||
if githubMaxResultsPerQuery >= totalFiles {
|
||||
return []string{
|
||||
cache.RequestString(RangeWithin{0, githubMaxFileSize}),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Find all the ranges of file sizes such that all files are queryable
|
||||
// using the Github API. This does not compute an optimal ranges, since
|
||||
// the number of queries needed to get the information required to
|
||||
// compute an optimal range is expected to be much larger than the
|
||||
// number of queries performed this way.
|
||||
//
|
||||
// The number of ranges is k = (number of files)/1000, and finding a
|
||||
// range is logarithmic in the max file size (n = filesize). This means
|
||||
// that preprocessing takes O(k * lg n) queries to find the ranges with
|
||||
// a binary search over file sizes.
|
||||
//
|
||||
// My intuition is that this approach is competitive to a perfectly
|
||||
// optimal solution, but I didn't actually take the time to do a
|
||||
// rigorous proof. Intuitively, since files sizes are typically power
|
||||
// law distibuted the binary search will be very skewed towards the
|
||||
// smaller file ranges. This means that in practice this approach will
|
||||
// make fewer than (#files/1000)*(log(n) = 19) queries for
|
||||
// preprocessing, since it reuses a lot of the queries in the denser
|
||||
// ranges. Furthermore, because of the distribution, it should be very
|
||||
// easy to find ranges that are very close to the upper bound, up to
|
||||
// the limiting factor of having no more than 1000 files accessible per
|
||||
// range.
|
||||
filesAccessible := uint64(0)
|
||||
sizes := make([]uint64, 0)
|
||||
for filesAccessible < totalFiles {
|
||||
target := filesAccessible + githubMaxResultsPerQuery
|
||||
if target >= totalFiles {
|
||||
break
|
||||
}
|
||||
|
||||
logger.Printf("%d accessible files, next target = %d\n",
|
||||
filesAccessible, target)
|
||||
|
||||
cur, err := lowerBoundFileCount(cache, target)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// If there are more than 1000 files in the next bucket, we must
|
||||
// advance anyway and lose out on some files :(.
|
||||
if l := len(sizes); l > 0 && sizes[l-1] == cur {
|
||||
cur++
|
||||
}
|
||||
|
||||
nextAccessible, err := cache.CountResults(cur)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(
|
||||
"cache should be populated at %d already, got %v",
|
||||
cur, err)
|
||||
}
|
||||
if nextAccessible < filesAccessible {
|
||||
return nil, fmt.Errorf(
|
||||
"number of results dropped from %d to %d within range search",
|
||||
filesAccessible, nextAccessible)
|
||||
}
|
||||
|
||||
filesAccessible = nextAccessible
|
||||
if nextAccessible < totalFiles {
|
||||
sizes = append(sizes, cur)
|
||||
}
|
||||
}
|
||||
|
||||
return formatFilesizeRanges(cache, sizes), nil
|
||||
}
|
||||
|
||||
// lowerBoundFileCount finds the filesize range from [0, return value] that has
|
||||
// the largest file count that is smaller than or equal to
|
||||
// githubMaxResultsPerQuery. It is important to note that this returned value
|
||||
// could already be in a previous range if the next file size has more than 1000
|
||||
// results. It is left to the caller to handle this bit of logic and guarantee
|
||||
// forward progession in this case.
|
||||
func lowerBoundFileCount(
|
||||
cache cachedSearch, targetFileCount uint64) (uint64, error) {
|
||||
|
||||
// Binary search for file sizes that make up the next <=1000 element
|
||||
// chunk.
|
||||
cur := uint64(0)
|
||||
increase := githubMaxFileSize / 2
|
||||
|
||||
for increase > 0 {
|
||||
mid := cur + increase
|
||||
|
||||
count, err := cache.CountResults(mid)
|
||||
if err != nil {
|
||||
return count, err
|
||||
}
|
||||
|
||||
if count <= targetFileCount {
|
||||
cur = mid
|
||||
}
|
||||
|
||||
if count == targetFileCount {
|
||||
break
|
||||
}
|
||||
|
||||
increase /= 2
|
||||
}
|
||||
|
||||
return cur, nil
|
||||
}
|
||||
|
||||
func formatFilesizeRanges(cache cachedSearch, sizes []uint64) []string {
|
||||
ranges := make([]string, 0, len(sizes)+1)
|
||||
|
||||
if len(sizes) > 0 {
|
||||
ranges = append(ranges, cache.RequestString(
|
||||
RangeLessThan{sizes[0] + 1},
|
||||
))
|
||||
}
|
||||
|
||||
for i := 0; i < len(sizes)-1; i += 1 {
|
||||
ranges = append(ranges, cache.RequestString(
|
||||
RangeWithin{sizes[i] + 1, sizes[i+1]},
|
||||
))
|
||||
|
||||
if i != len(sizes)-2 {
|
||||
continue
|
||||
}
|
||||
ranges = append(ranges, cache.RequestString(
|
||||
RangeGreaterThan{sizes[i+1]},
|
||||
))
|
||||
}
|
||||
|
||||
return ranges
|
||||
}
|
||||
90
internal/crawl/crawler/github/split_search_ranges_test.go
Normal file
90
internal/crawl/crawler/github/split_search_ranges_test.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package github
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
type testCachedSearch struct {
|
||||
cache map[uint64]uint64
|
||||
}
|
||||
|
||||
func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) {
|
||||
fmt.Printf("CountResults(%05x)\n", upperBound)
|
||||
count, ok := c.cache[upperBound]
|
||||
if !ok {
|
||||
return count, fmt.Errorf("cache not set at %x", upperBound)
|
||||
}
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func (c testCachedSearch) RequestString(filesize rangeFormatter) string {
|
||||
return filesize.RangeString()
|
||||
}
|
||||
|
||||
// TODO(damienr74) make tests easier to write.. I'm thinking I can make the test
|
||||
// cache take in a list of (filesize, count) pairs and it can populate the cache
|
||||
// without relying on how the implementation will create queries. This was only
|
||||
// a quick and dirty test to make sure that modifications are not going to break
|
||||
// the functionality.
|
||||
func TestRangeSplitting(t *testing.T) {
|
||||
// Keys follow the binary search depending on whether or not the range
|
||||
// is too small/large to find close to optimal filesize ranges. This
|
||||
// test is heavily tied to the fact that the search is using powers of two
|
||||
// to make progress in the search (hence the use of hexadecimal values).
|
||||
cache := testCachedSearch{
|
||||
map[uint64]uint64{
|
||||
0x80000: 5000,
|
||||
0x40000: 5000,
|
||||
0x20000: 5000,
|
||||
0x10000: 5000,
|
||||
0x08000: 5000,
|
||||
0x04000: 5000,
|
||||
0x02000: 5000,
|
||||
0x01000: 5000,
|
||||
0x00fff: 3950,
|
||||
0x00ffe: 3950,
|
||||
0x00ffc: 3950,
|
||||
0x00ff8: 3950,
|
||||
0x00ff0: 3950,
|
||||
0x00fe0: 3950,
|
||||
0x00fc0: 3950,
|
||||
0x00f80: 3950,
|
||||
0x00f00: 3950,
|
||||
0x00e00: 3950,
|
||||
0x00c00: 3950,
|
||||
0x00800: 3950,
|
||||
0x00400: 3950,
|
||||
0x00200: 3688,
|
||||
0x00180: 3028,
|
||||
0x00100: 2999,
|
||||
0x000c0: 2448,
|
||||
0x00080: 1999,
|
||||
0x00070: 1600,
|
||||
0x0006c: 1003,
|
||||
0x0006b: 1001,
|
||||
0x0006a: 999,
|
||||
0x00068: 999,
|
||||
0x00060: 999,
|
||||
0x00040: 999,
|
||||
0x00000: 0,
|
||||
},
|
||||
}
|
||||
|
||||
requests, err := FindRangesForRepoSearch(cache)
|
||||
if err != nil {
|
||||
t.Errorf("Error while finding ranges: %v", err)
|
||||
}
|
||||
expected := []string{
|
||||
"<107", // cache.RequestString(RangeLessThan{0x6b}),
|
||||
"107..128", // cache.RequestString(RangeWithin{0x6b, 0x80}),
|
||||
"129..256", // cache.RequestString(RangeWithin{0x81, 0x100}),
|
||||
"257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}),
|
||||
">4095", // cache.RequestString(RangeGreaterThan{0xfff}),
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(requests, expected) {
|
||||
t.Errorf("Expected requests (%v) to equal (%v)", requests, expected)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user