mirror of
https://github.com/kubernetes-sigs/kustomize.git
synced 2026-06-11 00:52:55 +00:00
Crawler performance improvements, better structure
Refactored the crawler implementation to make the whole thing more testable. Added a document interface to make the crawler generic. This will be useful for collecting plugins, and other documents.
This commit is contained in:
@@ -6,11 +6,19 @@ package crawler
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"sync"
|
||||
|
||||
_ "github.com/gomodule/redigo/redis"
|
||||
|
||||
"sigs.k8s.io/kustomize/internal/tools/doc"
|
||||
)
|
||||
|
||||
var (
|
||||
logger = log.New(os.Stdout, "Crawler: ", log.LstdFlags|log.LUTC|log.Llongfile)
|
||||
)
|
||||
|
||||
// Crawler forwards documents from source repositories to index and store them
|
||||
// for searching. Each crawler is responsible for querying it's source of
|
||||
// information, and forwarding files that have not been seen before or that need
|
||||
@@ -19,7 +27,152 @@ type Crawler interface {
|
||||
// Crawl returns when it is done processing. This method does not take
|
||||
// ownership of the channel. The channel is write only, and it
|
||||
// designates where the crawler should forward the documents.
|
||||
Crawl(ctx context.Context, output chan<- *doc.KustomizationDocument) error
|
||||
Crawl(ctx context.Context, output chan<- CrawlerDocument) error
|
||||
|
||||
// Get the document data given the FilePath, Repo, and Ref/Tag/Branch.
|
||||
FetchDocument(context.Context, *doc.Document) error
|
||||
// Write to the document what the created time is.
|
||||
SetCreated(context.Context, *doc.Document) error
|
||||
|
||||
Match(*doc.Document) bool
|
||||
}
|
||||
|
||||
type CrawlerDocument interface {
|
||||
ID() string
|
||||
GetDocument() *doc.Document
|
||||
GetResources() ([]*doc.Document, error)
|
||||
WasCached() bool
|
||||
}
|
||||
|
||||
type CrawlerSeed []*doc.Document
|
||||
|
||||
type IndexFunc func(CrawlerDocument, Crawler) error
|
||||
type Converter func(*doc.Document) (CrawlerDocument, error)
|
||||
|
||||
// Cleaner, more efficient, and more extensible crawler implementation.
|
||||
// The seed must include the ids of each document in the index.
|
||||
func CrawlFromSeed(ctx context.Context, seed CrawlerSeed,
|
||||
crawlers []Crawler, conv Converter, indx IndexFunc) {
|
||||
|
||||
seen := make(map[string]struct{})
|
||||
|
||||
logIfErr := func(err error) {
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
logger.Println("error: ", err)
|
||||
}
|
||||
|
||||
stack := make(CrawlerSeed, 0)
|
||||
|
||||
findMatch := func(d *doc.Document) Crawler {
|
||||
for _, crawl := range crawlers {
|
||||
if crawl.Match(d) {
|
||||
return crawl
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
addBranches := func(cdoc CrawlerDocument, match Crawler) {
|
||||
if _, ok := seen[cdoc.ID()]; ok {
|
||||
return
|
||||
}
|
||||
|
||||
seen[cdoc.ID()] = struct{}{}
|
||||
// Insert into index
|
||||
err := indx(cdoc, match)
|
||||
logIfErr(err)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
deps, err := cdoc.GetResources()
|
||||
logIfErr(err)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for _, dep := range deps {
|
||||
if _, ok := seen[dep.ID()]; ok {
|
||||
continue
|
||||
}
|
||||
stack = append(stack, dep)
|
||||
}
|
||||
}
|
||||
|
||||
doCrawl := func(docsPtr *CrawlerSeed) {
|
||||
for len(*docsPtr) > 0 {
|
||||
back := len(*docsPtr) - 1
|
||||
next := (*docsPtr)[back]
|
||||
*docsPtr = (*docsPtr)[:back]
|
||||
|
||||
match := findMatch(next)
|
||||
if match == nil {
|
||||
logIfErr(fmt.Errorf(
|
||||
"%v could not match any crawler", next))
|
||||
continue
|
||||
}
|
||||
|
||||
err := match.FetchDocument(ctx, next)
|
||||
logIfErr(err)
|
||||
// If there was no change or there is an error, we don't have
|
||||
// to branch out, since the dependencies are already in the
|
||||
// index, or we cannot find the document.
|
||||
if err != nil || next.WasCached() {
|
||||
continue
|
||||
}
|
||||
|
||||
cdoc, err := conv(next)
|
||||
logIfErr(err)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
addBranches(cdoc, match)
|
||||
}
|
||||
}
|
||||
// Exploit seed to update bulk of corpus.
|
||||
logger.Printf("updating %d documents from seed\n", len(seed))
|
||||
doCrawl(&seed)
|
||||
// Traverse any new links added while updating corpus.
|
||||
logger.Printf("crawling %d new documents found in the seed\n", len(stack))
|
||||
doCrawl(&stack)
|
||||
|
||||
ch := make(chan CrawlerDocument, 1<<10)
|
||||
wg := sync.WaitGroup{}
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for cdoc := range ch {
|
||||
if _, ok := seen[cdoc.ID()]; ok {
|
||||
continue
|
||||
}
|
||||
match := findMatch(cdoc.GetDocument())
|
||||
if match == nil {
|
||||
logIfErr(fmt.Errorf(
|
||||
"%v could not match any crawler", cdoc))
|
||||
continue
|
||||
}
|
||||
addBranches(cdoc, match)
|
||||
}
|
||||
}()
|
||||
|
||||
// Exploration through APIs.
|
||||
errs := CrawlerRunner(ctx, ch, crawlers)
|
||||
if errs != nil {
|
||||
for _, err := range errs {
|
||||
logIfErr(err)
|
||||
}
|
||||
}
|
||||
close(ch)
|
||||
logger.Println("Processing the new documents from the crawlers' exploration.")
|
||||
wg.Wait()
|
||||
// Handle deps of newly discovered documents.
|
||||
logger.Printf("crawling the %d new documents from the crawlers' exploration.",
|
||||
len(stack))
|
||||
doCrawl(&stack)
|
||||
}
|
||||
|
||||
// CrawlerRunner is a blocking function and only returns once all of the
|
||||
@@ -32,8 +185,15 @@ type Crawler interface {
|
||||
// The return value is an array of errors in which each index represents the
|
||||
// index of the crawler that emitted the error. Although the errors themselves
|
||||
// can be nil, the array will always be exactly the size of the crawlers array.
|
||||
//
|
||||
// Crawler Runner takes in a seed, which represents the documents stored in an
|
||||
// index somewhere. The document data is not required to be populated. If there
|
||||
// are many documents, this is preferable. The order of iteration over the seed
|
||||
// is not garanteed, but the CrawlerRunner does guarantee that every element
|
||||
// from the seed will be processed before any other documents from the
|
||||
// crawlers.
|
||||
func CrawlerRunner(ctx context.Context,
|
||||
output chan<- *doc.KustomizationDocument, crawlers []Crawler) []error {
|
||||
output chan<- CrawlerDocument, crawlers []Crawler) []error {
|
||||
|
||||
errs := make([]error, len(crawlers))
|
||||
wg := sync.WaitGroup{}
|
||||
@@ -41,12 +201,12 @@ func CrawlerRunner(ctx context.Context,
|
||||
for i, crawler := range crawlers {
|
||||
// Crawler implementations get their own channels to prevent a
|
||||
// crawler from closing the main output channel.
|
||||
docs := make(chan *doc.KustomizationDocument)
|
||||
docs := make(chan CrawlerDocument)
|
||||
wg.Add(2)
|
||||
|
||||
// Forward all of the documents from this crawler's channel to
|
||||
// the main output channel.
|
||||
go func(docs <-chan *doc.KustomizationDocument) {
|
||||
go func(docs <-chan CrawlerDocument) {
|
||||
defer wg.Done()
|
||||
for doc := range docs {
|
||||
output <- doc
|
||||
@@ -55,7 +215,7 @@ func CrawlerRunner(ctx context.Context,
|
||||
|
||||
// Run this crawler and capture its returned error.
|
||||
go func(idx int, crawler Crawler,
|
||||
docs chan<- *doc.KustomizationDocument) {
|
||||
docs chan<- CrawlerDocument) {
|
||||
|
||||
defer func() {
|
||||
wg.Done()
|
||||
|
||||
@@ -3,26 +3,88 @@ package crawler
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"sigs.k8s.io/kustomize/internal/tools/doc"
|
||||
"sigs.k8s.io/kustomize/v3/pkg/pgmconfig"
|
||||
)
|
||||
|
||||
const (
|
||||
kustomizeRepo = "https://github.com/kubernetes-sigs/kustomize"
|
||||
)
|
||||
|
||||
// Simple crawler that forwards it's list of documents to a provided channel and
|
||||
// returns it's error to the caller.
|
||||
type testCrawler struct {
|
||||
docs []doc.KustomizationDocument
|
||||
err error
|
||||
matchPrefix string
|
||||
err error
|
||||
docs []doc.KustomizationDocument
|
||||
lukp map[string]int
|
||||
}
|
||||
|
||||
func (c testCrawler) Match(d *doc.Document) bool {
|
||||
return d != nil && strings.HasPrefix(d.ID(), c.matchPrefix)
|
||||
}
|
||||
|
||||
func (c testCrawler) FetchDocument(ctx context.Context, d *doc.Document) error {
|
||||
if i, ok := c.lukp[d.ID()]; ok {
|
||||
d.DocumentData = c.docs[i].DocumentData
|
||||
return nil
|
||||
}
|
||||
for _, suffix := range pgmconfig.KustomizationFileNames {
|
||||
fmt.Println(d.ID(), "/", suffix)
|
||||
i, ok := c.lukp[d.ID()+"/"+suffix]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
d.FilePath += "/" + suffix
|
||||
d.DocumentData = c.docs[i].DocumentData
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("Document %v does not exist for matcher: %s",
|
||||
d, c.matchPrefix)
|
||||
}
|
||||
|
||||
func (c testCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
|
||||
d.CreationTime = &time.Time{}
|
||||
return nil
|
||||
}
|
||||
|
||||
func newCrawler(matchPrefix string, err error,
|
||||
docs []doc.KustomizationDocument) testCrawler {
|
||||
c := testCrawler{
|
||||
matchPrefix: matchPrefix,
|
||||
err: err,
|
||||
docs: docs,
|
||||
lukp: make(map[string]int),
|
||||
}
|
||||
for i, d := range docs {
|
||||
c.lukp[d.ID()] = i
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// Crawl implements the Crawler interface for testing.
|
||||
func (c testCrawler) Crawl(ctx context.Context,
|
||||
output chan<- *doc.KustomizationDocument) error {
|
||||
output chan<- CrawlerDocument) error {
|
||||
|
||||
for i := range c.docs {
|
||||
for i, d := range c.docs {
|
||||
isResource := true
|
||||
for _, suffix := range pgmconfig.KustomizationFileNames {
|
||||
if strings.HasSuffix(d.FilePath, suffix) {
|
||||
isResource = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if isResource {
|
||||
continue
|
||||
}
|
||||
output <- &c.docs[i]
|
||||
}
|
||||
return c.err
|
||||
@@ -45,6 +107,7 @@ func (s sortableDocs) Len() int {
|
||||
}
|
||||
|
||||
func TestCrawlerRunner(t *testing.T) {
|
||||
fmt.Println("testing CrawlerRunner")
|
||||
tests := []struct {
|
||||
tc []Crawler
|
||||
errs []error
|
||||
@@ -54,17 +117,27 @@ func TestCrawlerRunner(t *testing.T) {
|
||||
tc: []Crawler{
|
||||
testCrawler{
|
||||
docs: []doc.KustomizationDocument{
|
||||
{FilePath: "crawler1/doc1"},
|
||||
{FilePath: "crawler1/doc2"},
|
||||
{FilePath: "crawler1/doc3"},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc1/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc2/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc3/kustomization.yaml",
|
||||
}},
|
||||
},
|
||||
},
|
||||
testCrawler{err: errors.New("crawler2")},
|
||||
testCrawler{},
|
||||
testCrawler{
|
||||
docs: []doc.KustomizationDocument{
|
||||
{FilePath: "crawler4/doc1"},
|
||||
{FilePath: "crawler4/doc2"},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler4/doc1/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler4/doc2/kustomization.yaml",
|
||||
}},
|
||||
},
|
||||
err: errors.New("crawler4"),
|
||||
},
|
||||
@@ -76,17 +149,27 @@ func TestCrawlerRunner(t *testing.T) {
|
||||
errors.New("crawler4"),
|
||||
},
|
||||
docs: sortableDocs{
|
||||
{FilePath: "crawler1/doc1"},
|
||||
{FilePath: "crawler1/doc2"},
|
||||
{FilePath: "crawler1/doc3"},
|
||||
{FilePath: "crawler4/doc1"},
|
||||
{FilePath: "crawler4/doc2"},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc1/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc2/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler1/doc3/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler4/doc1/kustomization.yaml",
|
||||
}},
|
||||
{Document: doc.Document{
|
||||
FilePath: "crawler4/doc2/kustomization.yaml",
|
||||
}},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
output := make(chan *doc.KustomizationDocument)
|
||||
output := make(chan CrawlerDocument)
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
@@ -95,8 +178,8 @@ func TestCrawlerRunner(t *testing.T) {
|
||||
defer close(output)
|
||||
defer wg.Done()
|
||||
|
||||
errs := CrawlerRunner(context.Background(), output,
|
||||
test.tc)
|
||||
errs := CrawlerRunner(context.Background(),
|
||||
output, test.tc)
|
||||
|
||||
// Check that errors are returned as they should be.
|
||||
if !reflect.DeepEqual(errs, test.errs) {
|
||||
@@ -108,8 +191,13 @@ func TestCrawlerRunner(t *testing.T) {
|
||||
|
||||
// Iterate over the output channel of Crawler runner.
|
||||
returned := make(sortableDocs, 0, len(test.docs))
|
||||
for doc := range output {
|
||||
returned = append(returned, *doc)
|
||||
for o := range output {
|
||||
d, ok := o.(*doc.KustomizationDocument)
|
||||
if !ok || d == nil {
|
||||
t.Errorf("%T not expected type (%T)",
|
||||
o, d)
|
||||
}
|
||||
returned = append(returned, *d)
|
||||
}
|
||||
|
||||
// Check that all documents are received.
|
||||
@@ -122,3 +210,147 @@ func TestCrawlerRunner(t *testing.T) {
|
||||
wg.Wait()
|
||||
}
|
||||
}
|
||||
|
||||
func TestCrawlFromSeed(t *testing.T) {
|
||||
fmt.Println("testing CrawlFromSeed")
|
||||
|
||||
tests := []struct {
|
||||
seed CrawlerSeed
|
||||
matcher string
|
||||
corpus []doc.KustomizationDocument
|
||||
}{
|
||||
{
|
||||
seed: CrawlerSeed{
|
||||
{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/helloWorld/kustomization.yaml",
|
||||
},
|
||||
{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/kustomization.yaml",
|
||||
},
|
||||
},
|
||||
matcher: kustomizeRepo,
|
||||
corpus: []doc.KustomizationDocument{
|
||||
// Visited from the seed, will be ignored in the crawl.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/helloWorld/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- deployment.yaml
|
||||
`,
|
||||
}},
|
||||
// Also visited from the seed as a relative resource.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/helloWorld/deployment.yaml",
|
||||
DocumentData: `
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: hello
|
||||
`,
|
||||
}},
|
||||
// Visited from the seed. Has a remote import.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- https://github.com/kubernetes-sigs/kustomize/examples/other/overlay
|
||||
- service.yaml
|
||||
`,
|
||||
}},
|
||||
// Imported as a base from the seed.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/overlay/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- https://github.com/kubernetes-sigs/kustomize/examples/seedcrawl1
|
||||
- https://github.com/kubernetes-sigs/kustomize/examples/seedcrawl2
|
||||
`,
|
||||
}},
|
||||
// Imported as a resource from the seed.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/service.yaml",
|
||||
}},
|
||||
// Visited from crawling seed.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/seedcrawl1/kustomization.yml",
|
||||
}},
|
||||
// Visited from crawling seed.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/seedcrawl2/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- ../base
|
||||
- job.yaml
|
||||
`,
|
||||
}},
|
||||
// Visited from crawling seed.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/base/kustomization.yml",
|
||||
}},
|
||||
// Visited from crawling seed imported as resource.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/seedcrawl2/job.yaml",
|
||||
}},
|
||||
// Visited from the crawler runner.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/base/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- ../app
|
||||
`,
|
||||
}},
|
||||
// Visited from the crawler runner.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/app/kustomization.yaml",
|
||||
DocumentData: `
|
||||
resources:
|
||||
- resource.yaml
|
||||
`,
|
||||
}},
|
||||
// Visited from crawling runner imported as resource.
|
||||
{Document: doc.Document{
|
||||
RepositoryURL: kustomizeRepo,
|
||||
FilePath: "examples/other/app/resource.yaml",
|
||||
}},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
cr := newCrawler(tc.matcher, nil, tc.corpus)
|
||||
visited := make(map[string]int)
|
||||
CrawlFromSeed(context.Background(), tc.seed, []Crawler{cr},
|
||||
func(d *doc.Document) (CrawlerDocument, error) {
|
||||
return &doc.KustomizationDocument{
|
||||
Document: *d,
|
||||
}, nil
|
||||
},
|
||||
func(d CrawlerDocument, cr Crawler) error {
|
||||
visited[d.ID()]++
|
||||
return nil
|
||||
},
|
||||
)
|
||||
if lv, lc := len(visited), len(tc.corpus); lv != lc {
|
||||
t.Errorf("error: %d of %d documents visited.", lv, lc)
|
||||
t.Errorf("\nvisited (%v)\nexpected (%v).", visited, cr.lukp)
|
||||
}
|
||||
for id, cnt := range visited {
|
||||
if cnt != 1 {
|
||||
t.Errorf("%s not visited once (%d)", id, cnt)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,11 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"sigs.k8s.io/kustomize/internal/tools/crawler"
|
||||
"sigs.k8s.io/kustomize/internal/tools/doc"
|
||||
"sigs.k8s.io/kustomize/internal/tools/httpclient"
|
||||
"sigs.k8s.io/kustomize/v3/pkg/git"
|
||||
"sigs.k8s.io/kustomize/v3/pkg/pgmconfig"
|
||||
)
|
||||
|
||||
var logger = log.New(os.Stdout, "Github Crawler: ",
|
||||
@@ -34,6 +38,17 @@ type GitHubClient struct {
|
||||
client *http.Client
|
||||
}
|
||||
|
||||
func NewClient(accessToken string, retryCount uint64, client *http.Client) GitHubClient {
|
||||
return GitHubClient{
|
||||
retryCount: retryCount,
|
||||
client: client,
|
||||
RequestConfig: RequestConfig{
|
||||
perPage: githubMaxPageSize,
|
||||
accessToken: accessToken,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
|
||||
query Query) githubCrawler {
|
||||
|
||||
@@ -52,7 +67,7 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
|
||||
|
||||
// Implements crawler.Crawler.
|
||||
func (gc githubCrawler) Crawl(
|
||||
ctx context.Context, output chan<- *doc.KustomizationDocument) error {
|
||||
ctx context.Context, output chan<- crawler.CrawlerDocument) error {
|
||||
|
||||
noETagClient := GitHubClient{
|
||||
RequestConfig: gc.client.RequestConfig,
|
||||
@@ -80,13 +95,78 @@ func (gc githubCrawler) Crawl(
|
||||
}
|
||||
}
|
||||
|
||||
return errs
|
||||
if len(errs) > 0 {
|
||||
return errs
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) error {
|
||||
repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch
|
||||
repoSpec, err := git.NewRepoSpecFromUrl(repoURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid repospec: %v", err)
|
||||
}
|
||||
|
||||
url := "https://raw.githubusercontent.com/" + repoSpec.OrgRepo +
|
||||
"/" + repoSpec.Ref + "/" + repoSpec.Path
|
||||
|
||||
handle := func(resp *http.Response, err error, path string) error {
|
||||
if err == nil && resp.StatusCode == http.StatusOK {
|
||||
d.IsSame = httpclient.FromCache(resp.Header)
|
||||
defer resp.Body.Close()
|
||||
data, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
d.DocumentData = string(data)
|
||||
d.FilePath = d.FilePath + path
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
resp, err := gc.client.GetRawUserContent(url)
|
||||
if err := handle(resp, err, ""); err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, file := range pgmconfig.KustomizationFileNames {
|
||||
resp, err = gc.client.GetRawUserContent(url + "/" + file)
|
||||
err := handle(resp, err, "/"+file)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("File Not Found: %s", url)
|
||||
}
|
||||
|
||||
func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
|
||||
fs := GithubFileSpec{}
|
||||
fs.Repository.FullName = d.RepositoryURL + "/" + d.FilePath
|
||||
creationTime, err := gc.client.GetFileCreationTime(fs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
d.CreationTime = &creationTime
|
||||
return nil
|
||||
}
|
||||
|
||||
func (gc githubCrawler) Match(d *doc.Document) bool {
|
||||
url := d.RepositoryURL + "/" + d.FilePath + "?ref=" + "/" +
|
||||
d.DefaultBranch
|
||||
repoSpec, err := git.NewRepoSpecFromUrl(url)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return strings.Contains(repoSpec.Host, "github.com")
|
||||
}
|
||||
|
||||
// processQuery follows all of the pages in a query, and updates/adds the
|
||||
// documents from the crawl to the datastore/index.
|
||||
func processQuery(ctx context.Context, gcl GitHubClient, query string,
|
||||
output chan<- *doc.KustomizationDocument) error {
|
||||
output chan<- crawler.CrawlerDocument) error {
|
||||
|
||||
queryPages := make(chan GithubResponseInfo)
|
||||
|
||||
@@ -112,13 +192,6 @@ func processQuery(ctx context.Context, gcl GitHubClient, query string,
|
||||
}
|
||||
|
||||
for _, file := range page.Parsed.Items {
|
||||
// TODO(damienr74) This is where we'd need to
|
||||
// communicate with redis. Currently always doing a full
|
||||
// reindex of the documents. Since the documents are in
|
||||
// sorted order in each bucket, we can short circuit the
|
||||
// search when we find a file that has been seen, or we
|
||||
// can choose to selectively update files.
|
||||
|
||||
k, err := kustomizationResultAdapter(gcl, file)
|
||||
if err != nil {
|
||||
errs = append(errs, err)
|
||||
@@ -137,23 +210,33 @@ func processQuery(ctx context.Context, gcl GitHubClient, query string,
|
||||
}
|
||||
|
||||
func kustomizationResultAdapter(gcl GitHubClient, k GithubFileSpec) (
|
||||
*doc.KustomizationDocument, error) {
|
||||
crawler.CrawlerDocument, error) {
|
||||
|
||||
data, err := gcl.GetFileData(k)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
creationTime, err := gcl.GetFileCreationTime(k)
|
||||
if err != nil {
|
||||
logger.Printf("(error: %v) initializing to current time.", err)
|
||||
logger.Printf(
|
||||
"(error: %v) initializing to current time.\n", err)
|
||||
}
|
||||
|
||||
url := gcl.ReposRequest(k.Repository.FullName)
|
||||
defaultBranch, err := gcl.GetDefaultBranch(url)
|
||||
if err != nil {
|
||||
logger.Printf(
|
||||
"(error: %v) setting default_branch to master\n", err)
|
||||
defaultBranch = "master"
|
||||
}
|
||||
|
||||
doc := doc.KustomizationDocument{
|
||||
DocumentData: string(data),
|
||||
FilePath: k.Path,
|
||||
RepositoryURL: k.Repository.URL,
|
||||
CreationTime: creationTime,
|
||||
Document: doc.Document{
|
||||
DocumentData: string(data),
|
||||
FilePath: k.Path,
|
||||
DefaultBranch: defaultBranch,
|
||||
RepositoryURL: k.Repository.URL,
|
||||
},
|
||||
}
|
||||
|
||||
return &doc, nil
|
||||
@@ -227,7 +310,34 @@ func (gcl GitHubClient) GetFileData(k GithubFileSpec) ([]byte, error) {
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
return ioutil.ReadAll(resp.Body)
|
||||
data, err = ioutil.ReadAll(resp.Body)
|
||||
return data, err
|
||||
}
|
||||
|
||||
func (gcl GitHubClient) GetDefaultBranch(url string) (string, error) {
|
||||
resp, err := gcl.GetReposData(url)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf(
|
||||
"'%s' could not get default_branch: %v", url, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
data, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf(
|
||||
"could not read default_branch: %v", err)
|
||||
}
|
||||
|
||||
type defaultBranch struct {
|
||||
DefaultBranch string `json:"default_branch,omitempty"`
|
||||
}
|
||||
var branch defaultBranch
|
||||
err = json.Unmarshal(data, &branch)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf(
|
||||
"default_branch json malformed: %v", err)
|
||||
}
|
||||
|
||||
return branch.DefaultBranch, nil
|
||||
}
|
||||
|
||||
// GetFileCreationTime gets the earliest date of a file.
|
||||
@@ -301,29 +411,23 @@ func throttleRepoAPI() {
|
||||
<-contentRateTicker.C
|
||||
}
|
||||
|
||||
const (
|
||||
accessTokenKeyword = "access_token="
|
||||
perPageKeyword = "per_page="
|
||||
contentSearchURL = "https://api.github.com/repos"
|
||||
contentKeyword = "contents"
|
||||
)
|
||||
|
||||
type multiError []error
|
||||
|
||||
func (me multiError) Error() string {
|
||||
size := len(me) + 2
|
||||
strs := make([]string, size)
|
||||
strs[0] = "Errors [\n\t"
|
||||
strs[0] = "Errors ["
|
||||
for i, err := range me {
|
||||
strs[i+1] = err.Error()
|
||||
strs[i+1] = "\t" + err.Error()
|
||||
}
|
||||
strs[size-1] = "\n]"
|
||||
return strings.Join(strs, "\n\t")
|
||||
strs[size-1] = "]"
|
||||
return strings.Join(strs, "\n")
|
||||
}
|
||||
|
||||
type GithubFileSpec struct {
|
||||
Path string `json:"path,omitempty"`
|
||||
Repository struct {
|
||||
API string `json:"url,omitempty"`
|
||||
URL string `json:"html_url,omitempty"`
|
||||
FullName string `json:"full_name,omitempty"`
|
||||
} `json:"repository,omitempty"`
|
||||
|
||||
@@ -127,6 +127,11 @@ func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string {
|
||||
return rc.makeRequest(uri, Query{}).URL()
|
||||
}
|
||||
|
||||
func (rc RequestConfig) ReposRequest(fullRepoName string) string {
|
||||
uri := fmt.Sprintf("repos/%s", fullRepoName)
|
||||
return rc.makeRequest(uri, Query{}).URL()
|
||||
}
|
||||
|
||||
// CommitsRequest given the repo name, and a filepath returns a formatted query
|
||||
// for the Github API to find the commits that affect this file.
|
||||
func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
|
||||
|
||||
@@ -1,5 +1,102 @@
|
||||
package github
|
||||
|
||||
// GitHub only returns at most 1000 results per search query,
|
||||
// this is problematic if you want to retrieve all the results for a given
|
||||
// search query. However, GitHub allows you to specify as much as you want per
|
||||
// query to make things more specific. Specifically for files, GitHub allows
|
||||
// you to specify their sizes with range queries. This is very convenient
|
||||
// since it allows us to split the search into disjoint sets/shards of results
|
||||
// from the different file size ranges.
|
||||
//
|
||||
// Some important factors to consider:
|
||||
//
|
||||
// - These queries are rate limited by the API to roughly once query every two
|
||||
// seconds.
|
||||
//
|
||||
// - The search space for file sizes is in bytes, from 0B to < 512KiB (this is
|
||||
// a huge search space that cannot be probed linearly in a timely manner if
|
||||
// granularity is to be expected).
|
||||
//
|
||||
// - If you have K files there will likely be ~K/1000 sets that you have find
|
||||
// from this search space in order to get all of the results.
|
||||
//
|
||||
// - If you have O(K) sets it is unlikely that they are all of the same size,
|
||||
// since (most files are power law distributed). That means that the range
|
||||
// might be significantly smaller for 1000 small files, than it is for
|
||||
// 1000 large files.
|
||||
//
|
||||
// - This method is a best effort approach. There are some limitations to what
|
||||
// it can and can't do, so please note the following:
|
||||
//
|
||||
// + There may very well be a filesize that has more than 1000 results.
|
||||
// this method cannot help in this case. However, requerying over time
|
||||
// (days/weeks/months) while sorting by last indexed values may be
|
||||
// sufficient to eventually get all of the results.
|
||||
//
|
||||
// + It's possible that the github API returns inconsistent counts. This
|
||||
// is problematic in most cases, since it can cause many issues if the
|
||||
// case is not handled properly. For instance, if you requested the
|
||||
// number of files of an interval from size:0..64 and get that there
|
||||
// are 900 results, you may query at size:0..96 and get that there
|
||||
// are 800 results. To guarantee that this approach completes and does
|
||||
// not get into a query loop over the same intervals, it will retry a few
|
||||
// times and take the largest of the results or the largest previously
|
||||
// queried value from another range (in this case, the implementation
|
||||
// could decide that size:0..96 must have 900) results. This makes the
|
||||
// approach best effort even if there are no single file sizes of over
|
||||
// 1000 results.
|
||||
//
|
||||
//
|
||||
// The approach that was taken to solve this problem is the following:
|
||||
//
|
||||
// 1. Determine the total number of results by querying from the lower bound
|
||||
// to the upper bound (size:0..max). If there are less than 1000 files,
|
||||
// return a single range of values (size:0..max) since all results can be
|
||||
// retrieved.
|
||||
//
|
||||
// 2. Otherwise, set a target number of files to be 1000.
|
||||
//
|
||||
// 3. Binary search for the range from 0..r that provides a file count that is
|
||||
// less than or equal to the target. Once this value is found, store the
|
||||
// upper bound of range (r). If r is the same as the previous value, (or 0)
|
||||
// increase r by one (this guarantees progress, but will miss out on some
|
||||
// results).
|
||||
//
|
||||
// 4. Increase the target by 1000.
|
||||
//
|
||||
// 5. Repeat steps 3 and 4 until the target is at or exceeds the total number
|
||||
// of files.
|
||||
//
|
||||
//
|
||||
// In general there are other ways to get all of the files from GitHub. In
|
||||
// some cases it would be sufficient to just get the files that are being
|
||||
// updated/indexed by github periodically to update the corpus, so this
|
||||
// complicated approach does not have to be run every time. However, for
|
||||
// some searches, there may be too many results on a time interval to do
|
||||
// this simple update search limited to only 1000 results.
|
||||
//
|
||||
// There is also a more sophisticated approach that may yield better
|
||||
// performance:
|
||||
// - Perform this search once and create a prior distribution of file sizes.
|
||||
// Each time you want to retrieve the results of the query, scale the
|
||||
// prior of expected ranges to the current number of files. From each
|
||||
// expected range of 1000 files, perform a exponential search to find the
|
||||
// lower bound of the range. This would likely reduce the total number
|
||||
// of queries by a significant amount since it would only have to search
|
||||
// for a small set of values around each likely range boundary.
|
||||
//
|
||||
// However, actually retrieving the files will be the bottleneck operation
|
||||
// since the number of queries to find the ranges will be close to:
|
||||
// log2(maxFileSize) * totalResults / 1000 ~= totalResults / 50
|
||||
// whereas the number of queries to actually get all of the search results
|
||||
// are close to:
|
||||
// apiCallsPerResult * 10(pages) * 100(resultsPerPage) * totalResults / 1000
|
||||
// = apiCallsPerResult * totalResults.
|
||||
//
|
||||
// So it could very well take apiCallsPerResult * 50 times longer to acutally
|
||||
// fetch the results (assuming the quotas for the API calls are the same as the
|
||||
// search API), than it does to perform these range searches.
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math/bits"
|
||||
@@ -12,14 +109,20 @@ const (
|
||||
githubMaxResultsPerQuery = uint64(1000)
|
||||
)
|
||||
|
||||
// Interface for testing purposes. Not expecting to have multiple
|
||||
// implementations.
|
||||
// Interface instead of struct for testing purposes.
|
||||
// Not expecting to have multiple implementations.
|
||||
type cachedSearch interface {
|
||||
CountResults(uint64) (uint64, error)
|
||||
RequestString(filesize rangeFormatter) string
|
||||
}
|
||||
|
||||
// Cache uses bit tricks to be more efficient in detecting
|
||||
// cachedSearch is a simple data structure that maps the upper bound (r) of a
|
||||
// range from 0 to r to the number of files that have between 0 and r files
|
||||
// (inclusive). It also guarantees that the counts are monotonically increasing
|
||||
// (not strict) as the value for r increases, by looking at the maximal
|
||||
// previous file count for the value that precedes r in the cache.
|
||||
//
|
||||
// It uses a bit trick to be more efficient in detecting
|
||||
// inconsistencies in the returned data from the Github API.
|
||||
// Therefore, the cache expects a search to always start at 0, and
|
||||
// it expects the max file size to be a power of 2. If this is to be changed
|
||||
@@ -36,11 +139,12 @@ type cachedSearch interface {
|
||||
// problematic). The current cache implementation looks at the
|
||||
// predecessor entry to find out if the current value is monotonic.
|
||||
// This is where the bit trick is used, since each step in the binary
|
||||
// search is adding or ommiting to add a decreasing of 2 to the query value,
|
||||
// we can remove the least significant set bit to find the predecessor in
|
||||
// constant time. Ultimately since the search is rate limited, we could also
|
||||
// easily afford to compute this in linear time by iterating
|
||||
// over cached values.
|
||||
// search is adding or ommiting to add a decreasing power of 2 to the query
|
||||
// value, we can remove the least significant set bit to find the
|
||||
// predecessor in constant time. Ultimately since the search is rate
|
||||
// limited, we could also easily afford to compute this in linear time
|
||||
// by iterating over cached values. So this trick is not crucial to the
|
||||
// cache's performance.
|
||||
type githubCachedSearch struct {
|
||||
cache map[uint64]uint64
|
||||
gcl GitHubClient
|
||||
@@ -160,7 +264,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
|
||||
//
|
||||
// My intuition is that this approach is competitive to a perfectly
|
||||
// optimal solution, but I didn't actually take the time to do a
|
||||
// rigurous proof. Intuitively, since files sizes are typically power
|
||||
// rigorous proof. Intuitively, since files sizes are typically power
|
||||
// law distibuted the binary search will be very skewed towards the
|
||||
// smaller file ranges. This means that in practice this approach will
|
||||
// make fewer than (#files/1000)*(log(n) = 19) queries for
|
||||
|
||||
Reference in New Issue
Block a user