From c02b4f3a119e1934622773ef25f80dae6249ae60 Mon Sep 17 00:00:00 2001 From: Damien Robichaud Date: Tue, 9 Jul 2019 13:04:10 -0700 Subject: [PATCH 1/8] Initial (temporary) implementation of search doc. Document describing how to convert a kustomization file into a searchable document on appengine (will be changed to elasticsearch) soon. --- internal/search/doc/doc.go | 169 ++++++++++++++++++++++++++++++++ internal/search/doc/doc_test.go | 153 +++++++++++++++++++++++++++++ internal/search/go.mod | 9 ++ internal/search/go.sum | 24 +++++ travis/pre-commit.sh | 1 + 5 files changed, 356 insertions(+) create mode 100644 internal/search/doc/doc.go create mode 100644 internal/search/doc/doc_test.go create mode 100644 internal/search/go.mod create mode 100644 internal/search/go.sum diff --git a/internal/search/doc/doc.go b/internal/search/doc/doc.go new file mode 100644 index 000000000..5a87c184d --- /dev/null +++ b/internal/search/doc/doc.go @@ -0,0 +1,169 @@ +package doc + +import ( + "fmt" + "strings" + "time" + + "sigs.k8s.io/yaml" + + "google.golang.org/appengine/search" +) + +const ( + identifierStr = "identifier" + documentStr = "document" + repoURLStr = "repo_url" + filePathStr = "file_path" + creationTimeStr = "creation_time" +) + +// Represents an unbreakable character stream. +type Atom = search.Atom + +// Implements search.FieldLoadSaver in order to index this representation of a kustomization.yaml +// file. +type KustomizationDocument struct { + identifiers []Atom + FilePath Atom + RepositoryURL Atom + DocumentData string + CreationTime time.Time +} + +// Partially implements search.FieldLoadSaver. +func (k *KustomizationDocument) Load(fields []search.Field, metadata *search.DocumentMetadata) error { + k.identifiers = make([]search.Atom, 0) + wrongTypeError := func(name string, expected interface{}, actual interface{}) error { + return fmt.Errorf("%s expects type %T, found %#v", name, expected, actual) + } + + for _, f := range fields { + switch f.Name { + case identifierStr: + identifier, ok := f.Value.(search.Atom) + if !ok { + return wrongTypeError(f.Name, identifier, f.Value) + } + k.identifiers = append(k.identifiers, identifier) + + case documentStr: + document, ok := f.Value.(string) + if !ok { + return wrongTypeError(f.Name, document, f.Value) + } + k.DocumentData = document + + case filePathStr: + fp, ok := f.Value.(search.Atom) + if !ok { + return wrongTypeError(f.Name, fp, f.Value) + } + k.FilePath = fp + + case repoURLStr: + url, ok := f.Value.(search.Atom) + if !ok { + return wrongTypeError(f.Name, url, f.Value) + } + k.RepositoryURL = url + + case creationTimeStr: + time, ok := f.Value.(time.Time) + if !ok { + return wrongTypeError(f.Name, time, f.Value) + } + k.CreationTime = time + default: + return fmt.Errorf("KustomizationDocument field %s not recognized", f.Name) + } + } + + return nil +} + +// Partially implements search.FieldLoadSaver. +func (k *KustomizationDocument) Save() ([]search.Field, *search.DocumentMetadata, error) { + err := k.ParseYAML() + if err != nil { + return nil, nil, err + } + + extraFields := []search.Field{ + {Name: documentStr, Value: k.DocumentData}, + {Name: filePathStr, Value: k.FilePath}, + {Name: repoURLStr, Value: k.RepositoryURL}, + {Name: creationTimeStr, Value: k.CreationTime}, + } + + fields := make([]search.Field, 0, len(k.identifiers)+len(extraFields)) + for _, identifier := range k.identifiers { + fields = append(fields, search.Field{Name: identifierStr, Value: identifier}) + } + fields = append(fields, extraFields...) + + return fields, nil, nil +} + +func (k *KustomizationDocument) ParseYAML() error { + k.identifiers = make([]Atom, 0) + + var kustomization map[string]interface{} + err := yaml.Unmarshal([]byte(k.DocumentData), &kustomization) + if err != nil { + return fmt.Errorf("unable to parse kustomization file: %s", err) + } + + type Map struct { + data map[string]interface{} + prefix Atom + } + + toVisit := []Map{ + { + data: kustomization, + prefix: "", + }, + } + + atomJoin := func(vals ...interface{}) Atom { + strs := make([]string, 0, len(vals)) + for _, val := range vals { + strs = append(strs, fmt.Sprint(val)) + } + return Atom(strings.Trim(strings.Join(strs, " "), " ")) + } + + set := make(map[Atom]struct{}) + + for i := 0; i < len(toVisit); i++ { + visiting := toVisit[i] + for k, v := range visiting.data { + set[atomJoin(visiting.prefix, k)] = struct{}{} + switch value := v.(type) { + case map[string]interface{}: + toVisit = append(toVisit, Map{ + data: value, + prefix: atomJoin(visiting.prefix, fmt.Sprint(k)), + }) + case []interface{}: + for _, val := range value { + submap, ok := val.(map[string]interface{}) + if !ok { + continue + } + toVisit = append(toVisit, Map{ + data: submap, + prefix: atomJoin(visiting.prefix, fmt.Sprint(k)), + }) + } + } + } + } + + for key := range set { + k.identifiers = append(k.identifiers, key) + } + + return nil +} diff --git a/internal/search/doc/doc_test.go b/internal/search/doc/doc_test.go new file mode 100644 index 000000000..a28ef6814 --- /dev/null +++ b/internal/search/doc/doc_test.go @@ -0,0 +1,153 @@ +package doc + +import ( + "fmt" + "reflect" + "sort" + "strings" + "testing" + "time" + + "google.golang.org/appengine/search" +) + +func TestLoadFailures(t *testing.T) { + type sentinelType struct{} + sentinel := sentinelType{} + + testCases := [][]search.Field{ + {{Name: identifierStr, Value: sentinel}}, + {{Name: documentStr, Value: sentinel}}, + {{Name: repoURLStr, Value: sentinel}}, + {{Name: filePathStr, Value: sentinel}}, + {{Name: creationTimeStr, Value: sentinel}}, + } + + for _, test := range testCases { + var k KustomizationDocument + err := k.Load(test, nil) + if err == nil { + t.Errorf("Type missmatch %#v should not be loadable", test) + } + } +} + +func TestFieldLoadSaver(t *testing.T) { + + commonTestCases := []KustomizationDocument{ + { + identifiers: []Atom{"namePrefix", "metadata.name", "kind"}, + FilePath: "some/path/kustomization.yaml", + RepositoryURL: "https://example.com/kustomize", + CreationTime: time.Now(), + DocumentData: ` +namePrefix: dev- +metadata: + name: app +kind: Deployment +`, + }, + } + + for _, test := range commonTestCases { + fields, metadata, err := test.Save() + if err != nil { + t.Errorf("Error calling Save(): %s\n", err) + } + doc := KustomizationDocument{} + err = doc.Load(fields, metadata) + if err != nil { + t.Errorf("Doc failed to load: %s\n", err) + } + if !reflect.DeepEqual(test, doc) { + t.Errorf("Expected loaded document (%+v) to be equal to (%+v)\n", doc, test) + } + } +} + +func TestParseYAML(t *testing.T) { + testCases := []struct { + identifiers []Atom + yaml string + }{ + { + identifiers: []Atom{ + "namePrefix", + "metadata", + "metadata name", + "kind", + }, + yaml: ` +namePrefix: dev- +metadata: + name: app +kind: Deployment +`, + }, + { + identifiers: []Atom{ + "namePrefix", + "metadata", + "metadata name", + "metadata spec", + "metadata spec replicas", + "kind", + "replicas", + "replicas name", + "replicas count", + "resource", + }, + yaml: ` +namePrefix: dev- +# map of map +metadata: + name: n1 + spec: + replicas: 3 +kind: Deployment + +#list of map +replicas: +- name: n1 + count: 3 +- name: n2 + count: 3 + +# list +resource: +- file1.yaml +- file2.yaml +`, + }, + } + + atomStrs := func(atoms []Atom) []string { + strs := make([]string, 0, len(atoms)) + for _, val := range atoms { + strs = append(strs, fmt.Sprintf("%v", val)) + } + return strs + } + + for _, test := range testCases { + doc := KustomizationDocument{ + DocumentData: test.yaml, + FilePath: "example/path/kustomization.yaml", + } + + err := doc.ParseYAML() + if err != nil { + t.Errorf("Document error error: %s", err) + } + + docIDs := atomStrs(doc.identifiers) + expectedIDs := atomStrs(test.identifiers) + sort.Strings(docIDs) + sort.Strings(expectedIDs) + + if !reflect.DeepEqual(docIDs, expectedIDs) { + t.Errorf("Expected loaded document (%v) to be equal to (%v)\n", + strings.Join(docIDs, ","), strings.Join(expectedIDs, ",")) + } + } +} diff --git a/internal/search/go.mod b/internal/search/go.mod new file mode 100644 index 000000000..7bde01290 --- /dev/null +++ b/internal/search/go.mod @@ -0,0 +1,9 @@ +module sigs.k8s.io/kustomize/internal/search + +go 1.12 + +require ( + google.golang.org/appengine v1.6.1 + gopkg.in/yaml.v2 v2.2.2 // indirect + sigs.k8s.io/yaml v1.1.0 +) diff --git a/internal/search/go.sum b/internal/search/go.sum new file mode 100644 index 000000000..a93f1159e --- /dev/null +++ b/internal/search/go.sum @@ -0,0 +1,24 @@ +github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65 h1:+rhAzEzT3f4JtomfC371qB+0Ola2caSKcY69NUBZrRQ= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +google.golang.org/appengine v1.6.1 h1:QzqyMA1tlu6CgqCDUtU9V+ZKhLFT2dkJuANu5QaxI3I= +google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +sigs.k8s.io/yaml v1.1.0 h1:4A07+ZFc2wgJwo8YNlQpr1rVlgUDlxXHhPJciaPY5gs= +sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= diff --git a/travis/pre-commit.sh b/travis/pre-commit.sh index ba34d5723..8d55f2b2b 100755 --- a/travis/pre-commit.sh +++ b/travis/pre-commit.sh @@ -30,6 +30,7 @@ function testGoLangCILint { function testGoTest { go test -v ./... + (cd ./internal/search; go test -v ./...) } # These tests require the helm program, and at the moment From ca41674df318191f1ac4e044811bad2040beff0e Mon Sep 17 00:00:00 2001 From: Damien Robichaud Date: Tue, 9 Jul 2019 13:04:10 -0700 Subject: [PATCH 2/8] Implementation of basic crawler organisation. `crawler.Crawler` interface is defined, where a crawler has to implement a `Crawl` method that forwards document found by the crawler to a channel. A helper function that launches a list of crawlers concurrently and merges their channels into one main output channel, forwarding errors is also implemented. Finally, a test that verifies the correctness and concurrency of the helper method is provided. --- internal/search/crawler/crawler.go | 76 +++++++++++++++ internal/search/crawler/crawler_test.go | 124 ++++++++++++++++++++++++ 2 files changed, 200 insertions(+) create mode 100644 internal/search/crawler/crawler.go create mode 100644 internal/search/crawler/crawler_test.go diff --git a/internal/search/crawler/crawler.go b/internal/search/crawler/crawler.go new file mode 100644 index 000000000..499da8dbe --- /dev/null +++ b/internal/search/crawler/crawler.go @@ -0,0 +1,76 @@ +// Package crawler provides helper methods and defines an interface for lauching +// source repository crawlers that retrieve files from a source and forwards +// to a channel for indexing and retrieval. +package crawler + +import ( + "context" + "fmt" + "sync" + + "sigs.k8s.io/kustomize/internal/search/doc" +) + +// Crawler forwards documents from source repositories to index and store them +// for searching. Each crawler is responsible for querying it's source of +// information, and forwarding files that have not been seen before or that need +// updating. +type Crawler interface { + // Crawl returns when it is done processing. This method does not take + // ownership of the channel. The channel is write only, and it + // designates where the crawler should forward the documents. + Crawl(ctx context.Context, output chan<- *doc.KustomizationDocument) error +} + +// CrawlerRunner is a blocking function and only returns once all of the +// crawlers are finished with execution. +// +// This function uses the output channel to forward kustomization documents +// from a list of crawlers. The output is to be consumed by a database/search +// indexer for later retrieval. +// +// The return value is an array of errors in which each index represents the +// index of the crawler that emitted the error. Although the errors themselves +// can be nil, the array will always be exactly the size of the crawlers array. +func CrawlerRunner(ctx context.Context, + output chan<- *doc.KustomizationDocument, crawlers []Crawler) []error { + + errs := make([]error, len(crawlers)) + wg := sync.WaitGroup{} + + for i, crawler := range crawlers { + // Crawler implementations get their own channels to prevent a + // crawler from closing the main output channel. + docs := make(chan *doc.KustomizationDocument) + wg.Add(2) + + // Forward all of the documents from this crawler's channel to + // the main output channel. + go func(docs <-chan *doc.KustomizationDocument) { + defer wg.Done() + for doc := range docs { + output <- doc + } + }(docs) + + // Run this crawler and capture its returned error. + go func(idx int, crawler Crawler, + docs chan<- *doc.KustomizationDocument) { + + defer func() { + wg.Done() + if r := recover(); r != nil { + errs[idx] = fmt.Errorf( + "%+v panicked: %v, additional error %v", + crawler, r, errs[idx], + ) + } + }() + defer close(docs) + errs[idx] = crawler.Crawl(ctx, docs) + }(i, crawler, docs) // Copies the index and the crawler + } + + wg.Wait() + return errs +} diff --git a/internal/search/crawler/crawler_test.go b/internal/search/crawler/crawler_test.go new file mode 100644 index 000000000..b8a6af86d --- /dev/null +++ b/internal/search/crawler/crawler_test.go @@ -0,0 +1,124 @@ +package crawler + +import ( + "context" + "errors" + "reflect" + "sort" + "sync" + "testing" + + "sigs.k8s.io/kustomize/internal/search/doc" +) + +// Simple crawler that forwards it's list of documents to a provided channel and +// returns it's error to the caller. +type testCrawler struct { + docs []doc.KustomizationDocument + err error +} + +// Crawl implements the Crawler interface for testing. +func (c testCrawler) Crawl(ctx context.Context, + output chan<- *doc.KustomizationDocument) error { + + for i := range c.docs { + output <- &c.docs[i] + } + return c.err +} + +// Used to make sure that we're comparing documents in order. This is needed +// since these documents will be sent concurrently. +type sortableDocs []doc.KustomizationDocument + +func (s sortableDocs) Less(i, j int) bool { + return s[i].FilePath < s[j].FilePath +} + +func (s sortableDocs) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +func (s sortableDocs) Len() int { + return len(s) +} + +func TestCrawlerRunner(t *testing.T) { + tests := []struct { + tc []Crawler + errs []error + docs sortableDocs + }{ + { + tc: []Crawler{ + testCrawler{ + docs: []doc.KustomizationDocument{ + {FilePath: "crawler1/doc1"}, + {FilePath: "crawler1/doc2"}, + {FilePath: "crawler1/doc3"}, + }, + }, + testCrawler{err: errors.New("crawler2")}, + testCrawler{}, + testCrawler{ + docs: []doc.KustomizationDocument{ + {FilePath: "crawler4/doc1"}, + {FilePath: "crawler4/doc2"}, + }, + err: errors.New("crawler4"), + }, + }, + errs: []error{ + nil, + errors.New("crawler2"), + nil, + errors.New("crawler4"), + }, + docs: sortableDocs{ + {FilePath: "crawler1/doc1"}, + {FilePath: "crawler1/doc2"}, + {FilePath: "crawler1/doc3"}, + {FilePath: "crawler4/doc1"}, + {FilePath: "crawler4/doc2"}, + }, + }, + } + + for _, test := range tests { + output := make(chan *doc.KustomizationDocument) + wg := sync.WaitGroup{} + wg.Add(1) + + // Run the Crawler runner with a list of crawlers. + go func() { + defer close(output) + defer wg.Done() + + errs := CrawlerRunner(context.Background(), output, + test.tc) + + // Check that errors are returned as they should be. + if !reflect.DeepEqual(errs, test.errs) { + t.Errorf("Expected errs (%v) to equal (%v)", + errs, test.errs) + } + + }() + + // Iterate over the output channel of Crawler runner. + returned := make(sortableDocs, 0, len(test.docs)) + for doc := range output { + returned = append(returned, *doc) + } + + // Check that all documents are received. + sort.Sort(returned) + if !reflect.DeepEqual(returned, test.docs) { + t.Errorf("Expected docs (%v) to equal (%v)\n", + returned, test.docs) + } + + wg.Wait() + } +} From ac6918d70fe20b6937ae6a79a0840b8116118142 Mon Sep 17 00:00:00 2001 From: Damien Robichaud Date: Tue, 9 Jul 2019 13:30:12 -0700 Subject: [PATCH 3/8] Implementation of github query helper library. To make this easier to read, use, and modify, I've abstracted the important parts of the github query api into crawler/github/query.go which allows to describe at a high level what is to be searched without knowing the API syntax. --- internal/search/crawler/github/queries.go | 229 ++++++++++++++++++ .../search/crawler/github/queries_test.go | 119 +++++++++ 2 files changed, 348 insertions(+) create mode 100644 internal/search/crawler/github/queries.go create mode 100644 internal/search/crawler/github/queries_test.go diff --git a/internal/search/crawler/github/queries.go b/internal/search/crawler/github/queries.go new file mode 100644 index 000000000..7dc307914 --- /dev/null +++ b/internal/search/crawler/github/queries.go @@ -0,0 +1,229 @@ +package github + +import ( + "fmt" + "net/url" + "strings" +) + +const ( + perPageArg = "per_page" + accessTokenArg = "access_token" + + githubMaxPageSize = 100 +) + +// Implementation detail, not important to external API. +type queryField struct { + name string + value interface{} +} + +// Formats a query field. +func (qf queryField) String() string { + var value string + switch v := qf.value.(type) { + case string: + value = v + case rangeFormatter: + value = v.RangeString() + default: + value = fmt.Sprint(v) + } + + if qf.name == "" { + return value + } + return fmt.Sprint(qf.name, ":", value) +} + +// Example of formating a query: +// QueryWith( +// Filename("kustomization.yaml"), +// Filesize(RangeWithin{64, 192}), +// Keyword("copyright"), +// Keyword("2019"), +// ).String() +// +// Outputs "q=filename:kustomization.yaml+size:64..192+copyright+2018" which +// would search for files that have [64, 192] bytes (inclusive range) and that +// contain the keywords 'copyright' and '2019' somewhere in the file. +type Query []queryField + +func QueryWith(qfs ...queryField) Query { + return Query(qfs) +} + +func (q Query) String() string { + strs := make([]string, 0, len(q)) + for _, elem := range q { + str := elem.String() + if str == "" { + continue + } + strs = append(strs, str) + } + + query := strings.Join(strs, "+") + if query == "" { + return query + } + return "q=" + query +} + +// Keyword takes a single word, and formats it according to the Github API. +func Keyword(k string) queryField { + return queryField{value: k} +} + +// Filesize takes a rangeFormatter and formats it according to the Github API. +func Filesize(r rangeFormatter) queryField { + return queryField{name: "size", value: r} +} + +// Filename takes a filename and formats it according to the Github API. +func Filename(f string) queryField { + return queryField{name: "filename", value: f} +} + +// Path takes a filepath and formats it according to the Github API. +func Path(p string) queryField { + return queryField{name: "path", value: p} +} + +// RequestConfig stores common variables that must be present for the queries. +// - CodeSearchRequests: ask Github to check the code indices given a query. +// - ContentsRequests: ask Github where to download a resource given a repo and a +// file path. +// - CommitsRequests: asks Github to list commits made one a file. Useful to +// determine the date of a file. +type RequestConfig struct { + perPage uint64 + retryCount uint64 + accessToken string +} + +func NewRequestConfig( + perPage, retryCount uint64, accessToken string) RequestConfig { + + return RequestConfig{ + perPage: perPage, + retryCount: retryCount, + accessToken: accessToken, + } +} + +// CodeSearchRequestWith given a list of query parameters that specify the +// (patial) query, returns a request object with the (parital) query. Must call +// the URL method to get the string value of the URL. See request.CopyWith, to +// understand why the request object is useful. +func (rc RequestConfig) CodeSearchRequestWith(query Query) request { + req := rc.makeRequest("search/code", query) + req.vals.Set("sort", "indexed") + req.vals.Set("order", "desc") + return req +} + +// ContentsRequest given the repo name, and the filepath returns a formatted +// query for the Github API to find the dowload information of this filepath. +func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string { + uri := fmt.Sprintf("repos/%s/contents/%s", fullRepoName, path) + return rc.makeRequest(uri, Query{}).URL() +} + +// CommitsRequest given the repo name, and a filepath returns a formatted query +// for the Github API to find the commits that affect this file. +func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string { + uri := fmt.Sprintf("repos/%s/commits", fullRepoName) + return rc.makeRequest(uri, Query{Path(path)}).URL() +} + +// How many times to retry the queries before giving up (used by the crawler, +// not Github). +func (rc RequestConfig) RetryCount() uint64 { + return rc.retryCount +} + +func (rc RequestConfig) makeRequest(path string, query Query) request { + vals := url.Values{} + if rc.accessToken != "" { + vals.Set(accessTokenArg, rc.accessToken) + } + vals.Set(perPageArg, fmt.Sprint(rc.perPage)) + + return request{ + url: url.URL{ + Scheme: "https", + Host: "api.github.com", + Path: path, + }, + vals: vals, + query: query, + } +} + +type request struct { + url url.URL + vals url.Values + query Query +} + +// CopyWith copies the requests and adds the extra query parameters. Usefull +// for dynamically adding sizes to a filename only query without modifying it. +func (r request) CopyWith(queryParams ...queryField) request { + cpy := r + cpy.query = append(cpy.query, queryParams...) + return cpy +} + +// URL encodes the variables and the URL representation into a string. +func (r request) URL() string { + // Github does not handle URL encoding properly in its API for the + // q='...', so the query parameter is added without any encoding + // manually. + encoded := r.vals.Encode() + query := r.query.String() + sep := "&" + if query == "" { + sep = "" + } + if encoded == "" && query != "" { + sep = "?" + } + r.url.RawQuery = encoded + sep + query + return r.url.String() +} + +// Allows to define a range of numbers and print it in the github range +// query format https://help.github.com/en/articles/understanding-the-search-syntax. +type rangeFormatter interface { + RangeString() string +} + +// RangeLessThan is a range of values strictly less than (<) size. +type RangeLessThan struct { + size uint64 +} + +func (r RangeLessThan) RangeString() string { + return fmt.Sprintf("<%d", r.size) +} + +// RangeLessThan is a range of values strictly greater than (>) size. +type RangeGreaterThan struct { + size uint64 +} + +func (r RangeGreaterThan) RangeString() string { + return fmt.Sprintf(">%d", r.size) +} + +// RangeWithin is an inclusive range from start to end. +type RangeWithin struct { + start uint64 + end uint64 +} + +func (r RangeWithin) RangeString() string { + return fmt.Sprintf("%d..%d", r.start, r.end) +} diff --git a/internal/search/crawler/github/queries_test.go b/internal/search/crawler/github/queries_test.go new file mode 100644 index 000000000..98ef7d564 --- /dev/null +++ b/internal/search/crawler/github/queries_test.go @@ -0,0 +1,119 @@ +package github + +import ( + "testing" +) + +func TestQueryFields(t *testing.T) { + testCases := []struct { + formatter queryField + expected string + }{ + { + formatter: Keyword("keyword"), + expected: "keyword", + }, + { + formatter: Filesize(RangeLessThan{23}), + expected: "size:<23", + }, + { + formatter: Filesize(RangeWithin{24, 64}), + expected: "size:24..64", + }, + { + formatter: Filesize(RangeGreaterThan{64}), + expected: "size:>64", + }, + { + formatter: Path("some/path/to/file"), + expected: "path:some/path/to/file", + }, + { + formatter: Filename("kustomization.yaml"), + expected: "filename:kustomization.yaml", + }, + } + + for _, test := range testCases { + if result := test.formatter.String(); result != test.expected { + t.Errorf("got (%#v = %s), expected %s", test.formatter, result, test.expected) + } + } +} + +func TestQueryType(t *testing.T) { + testCases := []struct { + query Query + expected string + }{ + { + query: QueryWith( + Filesize(RangeWithin{24, 64}), + Filename("kustomization.yaml"), + Keyword("keyword1"), + Keyword("keyword2"), + ), + expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2", + }, + } + + for _, test := range testCases { + if queryStr := test.query.String(); queryStr != test.expected { + t.Errorf("got (%#v = %s), expected %s", test.query, queryStr, test.expected) + } + + } +} + +func TestGithubSearchQuery(t *testing.T) { + const ( + accessToken = "random_token" + perPage = 100 + ) + + testCases := []struct { + rc RequestConfig + codeQuery Query + fullRepoName string + path string + expectedCodeQuery string + expectedContentsQuery string + expectedCommitsQuery string + }{ + { + rc: RequestConfig{ + perPage: perPage, + accessToken: accessToken, + }, + codeQuery: Query{ + Filename("kustomization.yaml"), + Filesize(RangeWithin{64, 128}), + }, + fullRepoName: "kubernetes-sigs/kustomize", + path: "examples/helloWorld/kustomization.yaml", + + expectedCodeQuery: "https://api.github.com/search/code?" + + "access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128", + + expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" + + "examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100", + + expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" + + "access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml", + }, + } + + for _, test := range testCases { + if result := test.rc.CodeSearchRequestWith(test.codeQuery).URL(); result != test.expectedCodeQuery { + t.Errorf("Got code query: %s, expected %s", result, test.expectedCodeQuery) + } + + if result := test.rc.ContentsRequest(test.fullRepoName, test.path); result != test.expectedContentsQuery { + t.Errorf("Got contents query: %s, expected %s", result, test.expectedContentsQuery) + } + if result := test.rc.CommitsRequest(test.fullRepoName, test.path); result != test.expectedCommitsQuery { + t.Errorf("Got commits query: %s, expected %s", result, test.expectedCommitsQuery) + } + } +} From 62edcae233b50265a33ea3fe3005540fe02d8ea9 Mon Sep 17 00:00:00 2001 From: Damien Robichaud Date: Tue, 9 Jul 2019 15:15:19 -0700 Subject: [PATCH 4/8] Implementation of configurable github crawler. Currently I've left the search splitting by file size out of this commit since it's ~200 lines of logic, and I think it's best to get it reviewed separately. In it's current state the crawler would only be able to get the last 1000 indexed files by Github, but it does show the general structure of how the crawler is implemented. --- internal/search/crawler/github/crawler.go | 421 ++++++++++++++++++++++ 1 file changed, 421 insertions(+) create mode 100644 internal/search/crawler/github/crawler.go diff --git a/internal/search/crawler/github/crawler.go b/internal/search/crawler/github/crawler.go new file mode 100644 index 000000000..2c0ea9ed9 --- /dev/null +++ b/internal/search/crawler/github/crawler.go @@ -0,0 +1,421 @@ +// Package github implements the crawler.Crawler interface, getting data +// from the Github search API. +package github + +import ( + "context" + "encoding/json" + "fmt" + "io/ioutil" + "log" + "math" + "net/http" + "os" + "regexp" + "strconv" + "strings" + "time" + + "sigs.k8s.io/kustomize/internal/search/doc" +) + +var logger = log.New(os.Stdout, "Github Crawler: ", + log.LstdFlags|log.LUTC|log.Llongfile) + +// Implements crawler.Crawler. +type githubCrawler struct { + rc RequestConfig + query Query +} + +func NewCrawler( + accessToken string, retryCount uint64, query Query) githubCrawler { + + return githubCrawler{ + rc: RequestConfig{ + perPage: githubMaxPageSize, + retryCount: retryCount, + accessToken: accessToken, + }, + query: query, + } +} + +// Implements crawler.Crawler. +func (gc githubCrawler) Crawl( + ctx context.Context, output chan<- *doc.KustomizationDocument) error { + + // Range finding will be added in the next PR to make this one + // simpler/shorter. It would return multiple search queries for the + // Github API such that all documents can be retrieved. This is + // required since Github returns a max of 1000 results per query, so + // multiple queries that split the search space into chunks of 1000 + // kustomization files is required. + ranges := []string{gc.query.String()} + + // Query each range for files. + errs := make(multiError, 0) + for _, query := range ranges { + err := processQuery(ctx, gc.rc, query, output) + if err != nil { + errs = append(errs, err) + } + } + + return errs +} + +// processQuery follows all of the pages in a query, and updates/adds the +// documents from the crawl to the datastore/index. +func processQuery(ctx context.Context, rc RequestConfig, query string, + output chan<- *doc.KustomizationDocument) error { + + queryPages := make(chan GithubResponseInfo) + + go func() { + // Forward the document metadata to the retrieval channel. + // This separation allows for concurrent requests for the code + // search, and the retrieval portions of the API. + err := ForwardPaginatedQuery( + ctx, query, rc.retryCount, queryPages) + if err != nil { + // TODO(damienr74) handle this error with redis? + logger.Println(err) + } + close(queryPages) + }() + + errs := make(multiError, 0) + for page := range queryPages { + if page.Error != nil { + errs = append(errs, page.Error) + continue + } + + for _, file := range page.Parsed.Items { + // TODO(damienr74) This is where we'd need to + // communicate with redis. Currently always doing a full + // reindex of the documents. Since the documents are in + // sorted order in each bucket, we can short circuit the + // search when we find a file that has been seen, or we + // can choose to selectively update files. + + k, err := kustomizationResultAdapter(rc, file) + if err != nil { + errs = append(errs, err) + continue + } + output <- k + } + } + + return errs +} + +func kustomizationResultAdapter(rc RequestConfig, k GithubFileSpec) ( + *doc.KustomizationDocument, error) { + + data, err := GetFileData(rc, k) + if err != nil { + return nil, err + } + + creationTime, err := GetFileCreationTime(rc, k) + if err != nil { + logger.Printf("(Error: %v) initializing to current time.", err) + } + + doc := doc.KustomizationDocument{ + DocumentData: string(data), + FilePath: doc.Atom(k.Path), + RepositoryURL: doc.Atom(k.Repository.URL), + CreationTime: creationTime, + } + + return &doc, nil +} + +// ForwardPaginatedQuery follows the links to the next pages and performs all of +// the queries for a given search query, relaying the data from each request +// back to an output channel. +func ForwardPaginatedQuery(ctx context.Context, query string, retryCount uint64, + output chan<- GithubResponseInfo) error { + + response := parseGithubResponse(query, retryCount) + if response.Error != nil { + return response.Error + } + + output <- response + + for response.LastURL != "" && response.NextURL != "" { + select { + case <-ctx.Done(): + return nil + default: + response = parseGithubResponse(response.NextURL, retryCount) + if response.Error != nil { + return response.Error + } + + output <- response + } + } + + return nil +} + +// GetFileData gets the bytes from a file. +func GetFileData(rc RequestConfig, k GithubFileSpec) ([]byte, error) { + + url := rc.ContentsRequest(k.Repository.FullName, k.Path) + + logger.Println("content-url ", url) + resp, err := GetReposData(url, rc.RetryCount()) + if err != nil { + return nil, err + } + + data, err := ioutil.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + type githubContentRawURL struct { + DownloadURL string `json:"download_url,omitempty"` + } + var rawURL githubContentRawURL + err = json.Unmarshal(data, &rawURL) + if err != nil { + return nil, err + } + + logger.Println("raw-data-url", rawURL.DownloadURL) + resp, err = GetReposData(rawURL.DownloadURL, rc.RetryCount()) + if err != nil { + return nil, err + } + + return ioutil.ReadAll(resp.Body) +} + +// GetFileCreationTime gets the earliest date of a file. +func GetFileCreationTime( + rc RequestConfig, k GithubFileSpec) (time.Time, error) { + + url := rc.CommitsRequest(k.Repository.FullName, k.Path) + + defaultTime := time.Now() + + logger.Println("commits-url", url) + resp, err := GetReposData(url, rc.RetryCount()) + if err != nil { + return defaultTime, err + } + + type DateSpec struct { + Commit struct { + Author struct { + Date string `json:"date,omitempty"` + } `json:"author,omitempty"` + } `json:"commit,omitempty"` + } + + _, lastURL := parseGithubLinkFormat(resp.Header.Get("link")) + if lastURL != "" { + resp, err = GetReposData(lastURL, rc.RetryCount()) + if err != nil { + return defaultTime, err + } + } + + data, err := ioutil.ReadAll(resp.Body) + earliestDate := []DateSpec{} + err = json.Unmarshal(data, &earliestDate) + size := len(earliestDate) + if err != nil || size == 0 { + return defaultTime, err + } + + return time.Parse(time.RFC3339, earliestDate[size-1].Commit.Author.Date) +} + +// TODO(damienr74) change the tickers to actually check api rate limits, reset +// times, and throttle requests dynamically based off of current utilization, +// instead of hardcoding the documented values, these calls are not quota'd. +// +// See https://developer.github.com/v3/rate_limit/ for details. +var ( + searchRateTicker = time.NewTicker(time.Second * 2) + contentRateTicker = time.NewTicker(time.Second * 1) +) + +func throttleSearchAPI() { + <-searchRateTicker.C +} + +func throttleRepoAPI() { + <-contentRateTicker.C +} + +const ( + accessTokenKeyword = "access_token=" + perPageKeyword = "per_page=" + contentSearchURL = "https://api.github.com/repos" + contentKeyword = "contents" +) + +type multiError []error + +func (me multiError) Error() string { + size := len(me) + 2 + strs := make([]string, size) + strs[0] = "Errors [\n\t" + for i, err := range me { + strs[i+1] = err.Error() + } + strs[size-1] = "\n]" + return strings.Join(strs, "\n\t") +} + +type GithubFileSpec struct { + Path string `json:"path,omitempty"` + Repository struct { + URL string `json:"html_url,omitempty"` + FullName string `json:"full_name,omitempty"` + } `json:"repository,omitempty"` +} + +type githubResponse struct { + // MaxUint is reserved as a sentinel value. + // This is the number of files that match the query. + TotalCount uint64 `json:"total_count,omitempty"` + + // Github representation of a file. + Items []GithubFileSpec `json:"items,omitempty"` +} + +type GithubResponseInfo struct { + *http.Response + Parsed *githubResponse + Error error + NextURL string + LastURL string +} + +func parseGithubLinkFormat(links string) (string, string) { + const ( + linkNext = "next" + linkLast = "last" + linkInfoURL = 1 + linkInfoRel = 2 + ) + + next, last := "", "" + linkInfo := regexp.MustCompile(`<(.*)>.*; rel="(last|next)"`) + + for _, link := range strings.Split(links, ",") { + linkParse := linkInfo.FindStringSubmatch(link) + if len(linkParse) != 3 { + continue + } + + url := linkParse[linkInfoURL] + switch linkParse[linkInfoRel] { + case linkNext: + next = url + case linkLast: + last = url + default: + } + } + + return next, last +} + +func parseGithubResponse(getRequest string, retryCount uint64) GithubResponseInfo { + resp, err := SearchGithubAPI(getRequest, retryCount) + requestInfo := GithubResponseInfo{ + Response: resp, + Error: err, + Parsed: nil, + } + + if err != nil || resp == nil { + return requestInfo + } + + var data []byte + data, requestInfo.Error = ioutil.ReadAll(resp.Body) + if requestInfo.Error != nil { + return requestInfo + } + + if resp.StatusCode != http.StatusOK { + logger.Println("Query: ", getRequest) + logger.Println("Status not OK at the source") + logger.Println("Header Dump", resp.Header) + logger.Println("Body Dump", string(data)) + requestInfo.Error = fmt.Errorf("Request Rejected, Status '%s'", + resp.Status) + return requestInfo + } + + requestInfo.NextURL, requestInfo.LastURL = + parseGithubLinkFormat(resp.Header.Get("link")) + + resultCount := githubResponse{ + TotalCount: math.MaxUint64, + } + requestInfo.Error = json.Unmarshal(data, &resultCount) + if requestInfo.Error != nil { + return requestInfo + } + + requestInfo.Parsed = &resultCount + + return requestInfo + +} + +// SearchGithubAPI performs a search query and handles rate limitting for +// the 'code/search?' endpoint as well as timed retries in the case of abuse +// prevention. +func SearchGithubAPI(query string, retryCount uint64) (*http.Response, error) { + throttleSearchAPI() + return getWithRetry(query, retryCount) +} + +// GetReposData performs a search query and handles rate limitting for +// the '/repos' endpoint as well as timed retries in the case of abuse +// prevention. +func GetReposData(query string, retryCount uint64) (*http.Response, error) { + throttleRepoAPI() + return getWithRetry(query, retryCount) +} + +func getWithRetry( + query string, retryCount uint64) (resp *http.Response, err error) { + + resp, err = http.Get(query) + + for err == nil && + resp.StatusCode == http.StatusForbidden && + retryCount > 0 { + + retryTime := resp.Header.Get("Retry-After") + i, err := strconv.Atoi(retryTime) + if err != nil { + return resp, fmt.Errorf("Forbidden without 'Retry-After'") + } + logger.Printf( + "Status Forbidden, retring %d more times\n", retryCount) + + logger.Printf("Waiting %d seconds before retrying\n", i) + time.Sleep(time.Second * time.Duration(i)) + retryCount-- + resp, err = http.Get(query) + } + + return resp, err +} From e0d388c6f721ec33e8041d466b464ce7f40ae0c3 Mon Sep 17 00:00:00 2001 From: Damien Robichaud Date: Thu, 11 Jul 2019 11:49:58 -0700 Subject: [PATCH 5/8] Implements search query partitioning by filesize. Binary searches through different ranges of file sizes to create search queries with fewer than 1000 results. This is required since github will only return the first 1000 result of any search query. The implementation handles the case where some files may be deleted while the search is running, and (possibly artificially) assures that the number of files increases monotonically as the filesize range grows. The implementation also caches queries and is expected to make fewer than O((#files/1000) * lg(max file size)) API calls to retrieve the range queries that can be used to index all of the files. In practice running the search splitting takes a few minutes, while retrieving all of the data takes a few hours. --- internal/search/crawler/github/crawler.go | 15 +- .../crawler/github/split_search_ranges.go | 274 ++++++++++++++++++ .../github/split_search_ranges_test.go | 90 ++++++ 3 files changed, 372 insertions(+), 7 deletions(-) create mode 100644 internal/search/crawler/github/split_search_ranges.go create mode 100644 internal/search/crawler/github/split_search_ranges_test.go diff --git a/internal/search/crawler/github/crawler.go b/internal/search/crawler/github/crawler.go index 2c0ea9ed9..8c7c81f57 100644 --- a/internal/search/crawler/github/crawler.go +++ b/internal/search/crawler/github/crawler.go @@ -45,13 +45,14 @@ func NewCrawler( func (gc githubCrawler) Crawl( ctx context.Context, output chan<- *doc.KustomizationDocument) error { - // Range finding will be added in the next PR to make this one - // simpler/shorter. It would return multiple search queries for the - // Github API such that all documents can be retrieved. This is - // required since Github returns a max of 1000 results per query, so - // multiple queries that split the search space into chunks of 1000 - // kustomization files is required. - ranges := []string{gc.query.String()} + // Since Github returns a max of 1000 results per query, we can use + // multiple queries that split the search space into chunks of at most + // 1000 files to get all of the data. + ranges, err := FindRangesForRepoSearch(newCache(gc.rc, gc.query)) + if err != nil { + return fmt.Errorf("could not split search into ranges, %v\n", + err) + } // Query each range for files. errs := make(multiError, 0) diff --git a/internal/search/crawler/github/split_search_ranges.go b/internal/search/crawler/github/split_search_ranges.go new file mode 100644 index 000000000..c098fa6ac --- /dev/null +++ b/internal/search/crawler/github/split_search_ranges.go @@ -0,0 +1,274 @@ +package github + +import ( + "fmt" + "math/bits" +) + +// Files cannot be more than 2^19 bytes, according to +// https://help.github.com/en/articles/searching-code#considerations-for-code-search +const ( + githubMaxFileSize = uint64(1 << 19) + githubMaxResultsPerQuery = uint64(1000) +) + +// Interface for testing purposes. Not expecting to have multiple +// implementations. +type cachedSearch interface { + CountResults(uint64) (uint64, error) + RequestString(filesize rangeFormatter) string +} + +// Cache uses bit tricks to be more efficient in detecting +// inconsistencies in the returned data from the Github API. +// Therefore, the cache expects a search to always start at 0, and +// it expects the max file size to be a power of 2. If this is to be changed +// there are a few considerations to keep in mind: +// +// 1. The cache is only efficient if the queries can be reused, so if +// the first chunk of files lives in the range 0..x, continuing the +// search for the next chunk from x+1..max (while asymptotically sane) +// may actually be less efficient since the cache is essentially reset +// at every interval. This leads to a larger number of requests in +// practice, and requests are what's expensive (rate limits). +// +// 2. The github API is not perfectly monotonic.. (this is somewhat +// problematic). The current cache implementation looks at the +// predecessor entry to find out if the current value is monotonic. +// This is where the bit trick is used, since each step in the binary +// search is adding or ommiting to add a decreasing of 2 to the query value, +// we can remove the least significant set bit to find the predecessor in +// constant time. Ultimately since the search is rate limited, we could also +// easily afford to compute this in linear time by iterating +// over cached values. +type githubCachedSearch struct { + cache map[uint64]uint64 + retryCount uint64 + baseRequest request +} + +func newCache(rc RequestConfig, query Query) githubCachedSearch { + return githubCachedSearch{ + cache: map[uint64]uint64{ + 0: 0, + }, + retryCount: rc.RetryCount(), + baseRequest: rc.CodeSearchRequestWith(query), + } +} + +func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { + count, cached := c.cache[upperBound] + if cached { + return count, nil + } + + sizeRange := RangeWithin{0, upperBound} + rangeRequest := c.RequestString(sizeRange) + + result := parseGithubResponse(rangeRequest, c.retryCount) + if result.Error != nil { + return count, result.Error + } + + // As range search uses powers of 2 for binary search, the previously + // cached value is easy to find by removing the least significant set + // bit from the current upperBound, since each step of the search adds + // least significant set bit. + // + // Finding the predecessor could also be implemented by iterating over + // the map to find the largest key that is smaller than upperBound if + // this approach deemed too complex. + trail := bits.TrailingZeros64(upperBound) + prev := uint64(0) + if trail != 64 { + prev = upperBound - (1 << uint64(trail)) + } + + // Sometimes the github API is not monotonically increasing, or ouputs + // an erroneous value of 0, or 1. This logic makes sure that it was not + // erroneous, and that the sequence continues to be monotonic by setting + // the current query count to match the previous value. which at least + // guarantees that the range search terminates. + // + // On the other hand, if files are added, then we way loose out on some + // files in a reviously completed range, but these files should be there + // the next time the crawler runs, so this is not really problematic. + retryMonotonicCount := 4 + for result.Parsed.TotalCount < c.cache[prev] { + logger.Printf( + "Retrying query... current lower bound: %d, got: %d\n", + c.cache[prev], result.Parsed.TotalCount) + + result = parseGithubResponse(rangeRequest, c.retryCount) + if result.Error != nil { + return count, result.Error + } + + retryMonotonicCount-- + if retryMonotonicCount <= 0 { + result.Parsed.TotalCount = c.cache[prev] + logger.Println( + "Retries for monotonic check exceeded,", + " setting value to match predecessor") + } + } + + count = result.Parsed.TotalCount + logger.Printf("Caching new query %s, with count %d\n", + sizeRange.RangeString(), count) + c.cache[upperBound] = count + return count, nil +} + +func (c githubCachedSearch) RequestString(filesize rangeFormatter) string { + return c.baseRequest.CopyWith(Filesize(filesize)).URL() +} + +// Outputs a (possibly incomplete) list of ranges to query to find most search +// results as permissible by the search github search API. Github search only +// allows 1,000 results per query (paginated). +// Source: https://developer.github.com/v3/search/ +// +// This leaves the possibility of having file sizes with more than 1000 results, +// This would mean that the search as it is could not find all files. If queries +// are sorted by last indexed, and retrieved on regular intervals, it should be +// sufficient to get most if not all documents. +func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { + totalFiles, err := cache.CountResults(githubMaxFileSize) + if err != nil { + return nil, err + } + logger.Println("total files: ", totalFiles) + + if githubMaxResultsPerQuery >= totalFiles { + return []string{ + cache.RequestString(RangeWithin{0, githubMaxFileSize}), + }, nil + } + + // Find all the ranges of file sizes such that all files are queryable + // using the Github API. This does not compute an optimal ranges, since + // the number of queries needed to get the information required to + // compute an optimal range is expected to be much larger than the + // number of queries performed this way. + // + // The number of ranges is k = (number of files)/1000, and finding a + // range is logarithmic in the max file size (n = filesize). This means + // that preprocessing takes O(k * lg n) queries to find the ranges with + // a binary search over file sizes. + // + // My intuition is that this approach is competitive to a perfectly + // optimal solution, but I didn't actually take the time to do a + // rigurous proof. Intuitively, since files sizes are typically power + // law distibuted the binary search will be very skewed towards the + // smaller file ranges. This means that in practice this approach will + // make fewer than (#files/1000)*(log(n) = 19) queries for + // preprocessing, since it reuses a lot of the queries in the denser + // ranges. Furthermore, because of the distribution, it should be very + // easy to find ranges that are very close to the upper bound, up to + // the limiting factor of having no more than 1000 files accessible per + // range. + filesAccessible := uint64(0) + sizes := make([]uint64, 0) + for filesAccessible < totalFiles { + target := filesAccessible + githubMaxResultsPerQuery + if target >= totalFiles { + break + } + + logger.Printf("%d accessible files, next target = %d\n", + filesAccessible, target) + + cur, err := lowerBoundFileCount(cache, target) + if err != nil { + return nil, err + } + + // If there are more than 1000 files in the next bucket, we must + // advance anyway and lose out on some files :(. + if l := len(sizes); l > 0 && sizes[l-1] == cur { + cur++ + } + + nextAccessible, err := cache.CountResults(cur) + if err != nil { + return nil, fmt.Errorf( + "cache should be populated at %d already, got %v", + cur, err) + } + if nextAccessible < filesAccessible { + return nil, fmt.Errorf( + "Number of results dropped from %d to %d within range search", + filesAccessible, nextAccessible) + } + + filesAccessible = nextAccessible + if nextAccessible < totalFiles { + sizes = append(sizes, cur) + } + } + + return formatFilesizeRanges(cache, sizes), nil +} + +// lowerBoundFileCount finds the filesize range from [0, return value] that has +// the largest file count that is smaller than or equal to +// githubMaxResultsPerQuery. It is important to note that this returned value +// could already be in a previous range if the next file size has more than 1000 +// results. It is left to the caller to handle this bit of logic and guarantee +// forward progession in this case. +func lowerBoundFileCount( + cache cachedSearch, targetFileCount uint64) (uint64, error) { + + // Binary search for file sizes that make up the next <=1000 element + // chunk. + cur := uint64(0) + increase := githubMaxFileSize / 2 + + for increase > 0 { + mid := cur + increase + + count, err := cache.CountResults(mid) + if err != nil { + return count, err + } + + if count <= targetFileCount { + cur = mid + } + + if count == targetFileCount { + break + } + + increase /= 2 + } + + return cur, nil +} + +func formatFilesizeRanges(cache cachedSearch, sizes []uint64) []string { + ranges := make([]string, 0, len(sizes)+1) + + if len(sizes) > 0 { + ranges = append(ranges, cache.RequestString( + RangeLessThan{sizes[0] + 1}, + )) + } + + for i := 0; i < len(sizes)-1; i += 1 { + ranges = append(ranges, cache.RequestString( + RangeWithin{sizes[i] + 1, sizes[i+1]}, + )) + + if i != len(sizes)-2 { + continue + } + ranges = append(ranges, cache.RequestString( + RangeGreaterThan{sizes[i+1]}, + )) + } + + return ranges +} diff --git a/internal/search/crawler/github/split_search_ranges_test.go b/internal/search/crawler/github/split_search_ranges_test.go new file mode 100644 index 000000000..c175486e6 --- /dev/null +++ b/internal/search/crawler/github/split_search_ranges_test.go @@ -0,0 +1,90 @@ +package github + +import ( + "fmt" + "reflect" + "testing" +) + +type testCachedSearch struct { + cache map[uint64]uint64 +} + +func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) { + fmt.Printf("CountResults(%05x)\n", upperBound) + count, ok := c.cache[upperBound] + if !ok { + return count, fmt.Errorf("cache not set at %x", upperBound) + } + return count, nil +} + +func (c testCachedSearch) RequestString(filesize rangeFormatter) string { + return filesize.RangeString() +} + +// TODO(damienr74) make tests easier to write.. I'm thinking I can make the test +// cache take in a list of (filesize, count) pairs and it can populate the cache +// without relying on how the implementation will create queries. This was only +// a quick and dirty test to make sure that modifications are not going to break +// the functionality. +func TestRangeSplitting(t *testing.T) { + // Keys follow the binary search depending on whether or not the range + // is too small/large to find close to optimal filesize ranges. This + // test is heavily tied to the fact that the search is using powers of two + // to make progress in the search (hence the use of hexadecimal values). + cache := testCachedSearch{ + map[uint64]uint64{ + 0x80000: 5000, + 0x40000: 5000, + 0x20000: 5000, + 0x10000: 5000, + 0x08000: 5000, + 0x04000: 5000, + 0x02000: 5000, + 0x01000: 5000, + 0x00fff: 3950, + 0x00ffe: 3950, + 0x00ffc: 3950, + 0x00ff8: 3950, + 0x00ff0: 3950, + 0x00fe0: 3950, + 0x00fc0: 3950, + 0x00f80: 3950, + 0x00f00: 3950, + 0x00e00: 3950, + 0x00c00: 3950, + 0x00800: 3950, + 0x00400: 3950, + 0x00200: 3688, + 0x00180: 3028, + 0x00100: 2999, + 0x000c0: 2448, + 0x00080: 1999, + 0x00070: 1600, + 0x0006c: 1003, + 0x0006b: 1001, + 0x0006a: 999, + 0x00068: 999, + 0x00060: 999, + 0x00040: 999, + 0x00000: 0, + }, + } + + requests, err := FindRangesForRepoSearch(cache) + if err != nil { + t.Errorf("Error while finding ranges: %v", err) + } + expected := []string{ + "<107", // cache.RequestString(RangeLessThan{0x6b}), + "107..128", // cache.RequestString(RangeWithin{0x6b, 0x80}), + "129..256", // cache.RequestString(RangeWithin{0x81, 0x100}), + "257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}), + ">4095", // cache.RequestString(RangeGreaterThan{0xfff}), + } + + if !reflect.DeepEqual(requests, expected) { + t.Errorf("Expected requests (%v) to equal (%v)", requests, expected) + } +} From df779fd720afc2df6a1dd4461b578b0f5a34dff6 Mon Sep 17 00:00:00 2001 From: Damien Robichaud Date: Mon, 29 Jul 2019 20:25:45 -0700 Subject: [PATCH 6/8] Modify document for elasticsearch migration. --- internal/search/doc/doc.go | 196 ++++++++++++-------------------- internal/search/doc/doc_test.go | 121 +++++++------------- internal/search/go.mod | 1 - internal/search/go.sum | 18 --- 4 files changed, 110 insertions(+), 226 deletions(-) diff --git a/internal/search/doc/doc.go b/internal/search/doc/doc.go index 5a87c184d..38aa6505b 100644 --- a/internal/search/doc/doc.go +++ b/internal/search/doc/doc.go @@ -6,117 +6,53 @@ import ( "time" "sigs.k8s.io/yaml" - - "google.golang.org/appengine/search" ) -const ( - identifierStr = "identifier" - documentStr = "document" - repoURLStr = "repo_url" - filePathStr = "file_path" - creationTimeStr = "creation_time" -) - -// Represents an unbreakable character stream. -type Atom = search.Atom - -// Implements search.FieldLoadSaver in order to index this representation of a kustomization.yaml -// file. +// This document is meant to be used at the elasticsearch document type. +// Fields are serialized as-is to elasticsearch, where indices are built +// to facilitate text search queries. Identifiers, Values, FilePath, +// RepositoryURL and DocumentData are meant to be searched for text queries +// directly, while the other fields can either be used as a filter, or as +// additional metadata displayed in the UI. +// +// The fields of the document and their purpose are listed below: +// - DocumentData contains the contents of the kustomization file. +// - Kinds Represents the kubernetes Kinds that are in this file. +// - Identifiers are a list of (partial and full) identifier paths that can be +// found by users. Each part of a path is delimited by ":" e.g. spec:replicas. +// - Values are a list of identifier paths and their values that can be found by +// search queries. The path is delimited by ":" and the value follows the "=" +// symbol e.g. spec:replicas=4. +// - FilePath is the path of the file. +// - RepositoryURL is the URL of the source repository. +// - CreationTime is the time at which the file was created. +// +// Representing each Identifier and Value as a flat string representation +// facilitates the use of complex text search features from elasticsearch such +// as fuzzy searching, regex, wildcards, etc. type KustomizationDocument struct { - identifiers []Atom - FilePath Atom - RepositoryURL Atom - DocumentData string - CreationTime time.Time + DocumentData string `json:"document,omitempty"` + Kinds []string `json:"kinds,omitempty"` + Identifiers []string `json:"identifiers,omitempty"` + Values []string `json:"values,omitempty"` + FilePath string `json:"filePath,omitempty"` + RepositoryURL string `json:"repositoryUrl,omitempty"` + CreationTime time.Time `json:"creationTime,omitempty"` } -// Partially implements search.FieldLoadSaver. -func (k *KustomizationDocument) Load(fields []search.Field, metadata *search.DocumentMetadata) error { - k.identifiers = make([]search.Atom, 0) - wrongTypeError := func(name string, expected interface{}, actual interface{}) error { - return fmt.Errorf("%s expects type %T, found %#v", name, expected, actual) - } - - for _, f := range fields { - switch f.Name { - case identifierStr: - identifier, ok := f.Value.(search.Atom) - if !ok { - return wrongTypeError(f.Name, identifier, f.Value) - } - k.identifiers = append(k.identifiers, identifier) - - case documentStr: - document, ok := f.Value.(string) - if !ok { - return wrongTypeError(f.Name, document, f.Value) - } - k.DocumentData = document - - case filePathStr: - fp, ok := f.Value.(search.Atom) - if !ok { - return wrongTypeError(f.Name, fp, f.Value) - } - k.FilePath = fp - - case repoURLStr: - url, ok := f.Value.(search.Atom) - if !ok { - return wrongTypeError(f.Name, url, f.Value) - } - k.RepositoryURL = url - - case creationTimeStr: - time, ok := f.Value.(time.Time) - if !ok { - return wrongTypeError(f.Name, time, f.Value) - } - k.CreationTime = time - default: - return fmt.Errorf("KustomizationDocument field %s not recognized", f.Name) - } - } - - return nil -} - -// Partially implements search.FieldLoadSaver. -func (k *KustomizationDocument) Save() ([]search.Field, *search.DocumentMetadata, error) { - err := k.ParseYAML() - if err != nil { - return nil, nil, err - } - - extraFields := []search.Field{ - {Name: documentStr, Value: k.DocumentData}, - {Name: filePathStr, Value: k.FilePath}, - {Name: repoURLStr, Value: k.RepositoryURL}, - {Name: creationTimeStr, Value: k.CreationTime}, - } - - fields := make([]search.Field, 0, len(k.identifiers)+len(extraFields)) - for _, identifier := range k.identifiers { - fields = append(fields, search.Field{Name: identifierStr, Value: identifier}) - } - fields = append(fields, extraFields...) - - return fields, nil, nil -} - -func (k *KustomizationDocument) ParseYAML() error { - k.identifiers = make([]Atom, 0) +func (doc *KustomizationDocument) ParseYAML() error { + doc.Identifiers = make([]string, 0) + doc.Values = make([]string, 0) var kustomization map[string]interface{} - err := yaml.Unmarshal([]byte(k.DocumentData), &kustomization) + err := yaml.Unmarshal([]byte(doc.DocumentData), &kustomization) if err != nil { return fmt.Errorf("unable to parse kustomization file: %s", err) } type Map struct { data map[string]interface{} - prefix Atom + prefix string } toVisit := []Map{ @@ -126,43 +62,53 @@ func (k *KustomizationDocument) ParseYAML() error { }, } - atomJoin := func(vals ...interface{}) Atom { - strs := make([]string, 0, len(vals)) - for _, val := range vals { - strs = append(strs, fmt.Sprint(val)) - } - return Atom(strings.Trim(strings.Join(strs, " "), " ")) - } - - set := make(map[Atom]struct{}) - + identifierSet := make(map[string]struct{}) + valueSet := make(map[string]struct{}) for i := 0; i < len(toVisit); i++ { visiting := toVisit[i] for k, v := range visiting.data { - set[atomJoin(visiting.prefix, k)] = struct{}{} - switch value := v.(type) { - case map[string]interface{}: - toVisit = append(toVisit, Map{ - data: value, - prefix: atomJoin(visiting.prefix, fmt.Sprint(k)), - }) - case []interface{}: - for _, val := range value { - submap, ok := val.(map[string]interface{}) - if !ok { - continue - } + identifier := fmt.Sprintf("%s:%s", visiting.prefix, + strings.Replace(k, ":", "%3A", -1)) + // noop after the first iteration. + identifier = strings.TrimLeft(identifier, ":") + + // Recursive function traverses structure to find + // identifiers and values. These later get formatted + // into doc.Identifiers and doc.Values respectively. + var traverseStructure func(interface{}) + traverseStructure = func(arg interface{}) { + switch value := arg.(type) { + case map[string]interface{}: toVisit = append(toVisit, Map{ - data: submap, - prefix: atomJoin(visiting.prefix, fmt.Sprint(k)), + data: value, + prefix: identifier, }) + case []interface{}: + for _, val := range value { + traverseStructure(val) + } + case interface{}: + esc := strings.Replace(fmt.Sprintf("%v", + value), ":", "%3A", -1) + + valuePath := fmt.Sprintf("%s=%v", + identifier, esc) + valueSet[valuePath] = struct{}{} } } + traverseStructure(v) + + identifierSet[identifier] = struct{}{} + } } - for key := range set { - k.identifiers = append(k.identifiers, key) + for val := range valueSet { + doc.Values = append(doc.Values, val) + } + + for key := range identifierSet { + doc.Identifiers = append(doc.Identifiers, key) } return nil diff --git a/internal/search/doc/doc_test.go b/internal/search/doc/doc_test.go index a28ef6814..919d2bd94 100644 --- a/internal/search/doc/doc_test.go +++ b/internal/search/doc/doc_test.go @@ -1,82 +1,30 @@ package doc import ( - "fmt" "reflect" "sort" "strings" "testing" - "time" - - "google.golang.org/appengine/search" ) -func TestLoadFailures(t *testing.T) { - type sentinelType struct{} - sentinel := sentinelType{} - - testCases := [][]search.Field{ - {{Name: identifierStr, Value: sentinel}}, - {{Name: documentStr, Value: sentinel}}, - {{Name: repoURLStr, Value: sentinel}}, - {{Name: filePathStr, Value: sentinel}}, - {{Name: creationTimeStr, Value: sentinel}}, - } - - for _, test := range testCases { - var k KustomizationDocument - err := k.Load(test, nil) - if err == nil { - t.Errorf("Type missmatch %#v should not be loadable", test) - } - } -} - -func TestFieldLoadSaver(t *testing.T) { - - commonTestCases := []KustomizationDocument{ - { - identifiers: []Atom{"namePrefix", "metadata.name", "kind"}, - FilePath: "some/path/kustomization.yaml", - RepositoryURL: "https://example.com/kustomize", - CreationTime: time.Now(), - DocumentData: ` -namePrefix: dev- -metadata: - name: app -kind: Deployment -`, - }, - } - - for _, test := range commonTestCases { - fields, metadata, err := test.Save() - if err != nil { - t.Errorf("Error calling Save(): %s\n", err) - } - doc := KustomizationDocument{} - err = doc.Load(fields, metadata) - if err != nil { - t.Errorf("Doc failed to load: %s\n", err) - } - if !reflect.DeepEqual(test, doc) { - t.Errorf("Expected loaded document (%+v) to be equal to (%+v)\n", doc, test) - } - } -} - func TestParseYAML(t *testing.T) { testCases := []struct { - identifiers []Atom + identifiers []string + values []string yaml string }{ { - identifiers: []Atom{ + identifiers: []string{ "namePrefix", "metadata", - "metadata name", + "metadata:name", "kind", }, + values: []string{ + "namePrefix=dev-", + "metadata:name=app", + "kind=Deployment", + }, yaml: ` namePrefix: dev- metadata: @@ -85,18 +33,29 @@ kind: Deployment `, }, { - identifiers: []Atom{ + identifiers: []string{ "namePrefix", "metadata", - "metadata name", - "metadata spec", - "metadata spec replicas", + "metadata:name", + "metadata:spec", + "metadata:spec:replicas", "kind", "replicas", - "replicas name", - "replicas count", + "replicas:name", + "replicas:count", "resource", }, + values: []string{ + "namePrefix=dev-", + "metadata:name=n1", + "metadata:spec:replicas=3", + "kind=Deployment", + "replicas:name=n1", + "replicas:name=n2", + "replicas:count=3", + "resource=file1.yaml", + "resource=file2.yaml", + }, yaml: ` namePrefix: dev- # map of map @@ -121,14 +80,6 @@ resource: }, } - atomStrs := func(atoms []Atom) []string { - strs := make([]string, 0, len(atoms)) - for _, val := range atoms { - strs = append(strs, fmt.Sprintf("%v", val)) - } - return strs - } - for _, test := range testCases { doc := KustomizationDocument{ DocumentData: test.yaml, @@ -140,14 +91,20 @@ resource: t.Errorf("Document error error: %s", err) } - docIDs := atomStrs(doc.identifiers) - expectedIDs := atomStrs(test.identifiers) - sort.Strings(docIDs) - sort.Strings(expectedIDs) + cmpStrings := func(got, expected []string, label string) { + sort.Strings(got) + sort.Strings(expected) + + if !reflect.DeepEqual(got, expected) { + t.Errorf("Expected %s (%v) to be equal to (%v)\n", + label, + strings.Join(got, ","), + strings.Join(expected, ",")) + } - if !reflect.DeepEqual(docIDs, expectedIDs) { - t.Errorf("Expected loaded document (%v) to be equal to (%v)\n", - strings.Join(docIDs, ","), strings.Join(expectedIDs, ",")) } + + cmpStrings(doc.Identifiers, test.identifiers, "identifiers") + cmpStrings(doc.Values, test.values, "values") } } diff --git a/internal/search/go.mod b/internal/search/go.mod index 7bde01290..e408ff68c 100644 --- a/internal/search/go.mod +++ b/internal/search/go.mod @@ -3,7 +3,6 @@ module sigs.k8s.io/kustomize/internal/search go 1.12 require ( - google.golang.org/appengine v1.6.1 gopkg.in/yaml.v2 v2.2.2 // indirect sigs.k8s.io/yaml v1.1.0 ) diff --git a/internal/search/go.sum b/internal/search/go.sum index a93f1159e..60aa01b56 100644 --- a/internal/search/go.sum +++ b/internal/search/go.sum @@ -1,21 +1,3 @@ -github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= -github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= -golang.org/x/net v0.0.0-20190603091049-60506f45cf65 h1:+rhAzEzT3f4JtomfC371qB+0Ola2caSKcY69NUBZrRQ= -golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= -google.golang.org/appengine v1.6.1 h1:QzqyMA1tlu6CgqCDUtU9V+ZKhLFT2dkJuANu5QaxI3I= -google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= From fe45157b2606d0316f4da3e018bed85ea9ddef5f Mon Sep 17 00:00:00 2001 From: Damien Robichaud Date: Tue, 30 Jul 2019 16:01:00 -0700 Subject: [PATCH 7/8] Update crawler to cache web request form github. - Increase logging signal to noise ratio. - Allow to specify the `http.Client` for github requests. (This allows the use of caching http.Clients). - Clean up implementation. --- internal/search/crawler/github/crawler.go | 180 ++++++++++++------ internal/search/crawler/github/queries.go | 12 +- .../crawler/github/split_search_ranges.go | 12 +- internal/search/go.mod | 2 + internal/search/go.sum | 4 + internal/search/httpclient/httpclient.go | 19 ++ 6 files changed, 150 insertions(+), 79 deletions(-) create mode 100644 internal/search/httpclient/httpclient.go diff --git a/internal/search/crawler/github/crawler.go b/internal/search/crawler/github/crawler.go index 8c7c81f57..1195ed07d 100644 --- a/internal/search/crawler/github/crawler.go +++ b/internal/search/crawler/github/crawler.go @@ -24,18 +24,27 @@ var logger = log.New(os.Stdout, "Github Crawler: ", // Implements crawler.Crawler. type githubCrawler struct { - rc RequestConfig - query Query + client GitHubClient + query Query } -func NewCrawler( - accessToken string, retryCount uint64, query Query) githubCrawler { +type GitHubClient struct { + RequestConfig + retryCount uint64 + client *http.Client +} + +func NewCrawler(accessToken string, retryCount uint64, client *http.Client, + query Query) githubCrawler { return githubCrawler{ - rc: RequestConfig{ - perPage: githubMaxPageSize, - retryCount: retryCount, - accessToken: accessToken, + client: GitHubClient{ + retryCount: retryCount, + client: client, + RequestConfig: RequestConfig{ + perPage: githubMaxPageSize, + accessToken: accessToken, + }, }, query: query, } @@ -45,19 +54,27 @@ func NewCrawler( func (gc githubCrawler) Crawl( ctx context.Context, output chan<- *doc.KustomizationDocument) error { + noETagClient := GitHubClient{ + RequestConfig: gc.client.RequestConfig, + client: &http.Client{Timeout: gc.client.client.Timeout}, + retryCount: gc.client.retryCount, + } + // Since Github returns a max of 1000 results per query, we can use // multiple queries that split the search space into chunks of at most // 1000 files to get all of the data. - ranges, err := FindRangesForRepoSearch(newCache(gc.rc, gc.query)) + ranges, err := FindRangesForRepoSearch(newCache(noETagClient, gc.query)) if err != nil { - return fmt.Errorf("could not split search into ranges, %v\n", - err) + return fmt.Errorf("could not split %v into ranges, %v\n", + gc.query, err) } + logger.Println("ranges: ", ranges) + // Query each range for files. errs := make(multiError, 0) for _, query := range ranges { - err := processQuery(ctx, gc.rc, query, output) + err := processQuery(ctx, gc.client, query, output) if err != nil { errs = append(errs, err) } @@ -68,7 +85,7 @@ func (gc githubCrawler) Crawl( // processQuery follows all of the pages in a query, and updates/adds the // documents from the crawl to the datastore/index. -func processQuery(ctx context.Context, rc RequestConfig, query string, +func processQuery(ctx context.Context, gcl GitHubClient, query string, output chan<- *doc.KustomizationDocument) error { queryPages := make(chan GithubResponseInfo) @@ -77,8 +94,7 @@ func processQuery(ctx context.Context, rc RequestConfig, query string, // Forward the document metadata to the retrieval channel. // This separation allows for concurrent requests for the code // search, and the retrieval portions of the API. - err := ForwardPaginatedQuery( - ctx, query, rc.retryCount, queryPages) + err := gcl.ForwardPaginatedQuery(ctx, query, queryPages) if err != nil { // TODO(damienr74) handle this error with redis? logger.Println(err) @@ -87,6 +103,8 @@ func processQuery(ctx context.Context, rc RequestConfig, query string, }() errs := make(multiError, 0) + errorCnt := 0 + totalCnt := 0 for page := range queryPages { if page.Error != nil { errs = append(errs, page.Error) @@ -101,35 +119,40 @@ func processQuery(ctx context.Context, rc RequestConfig, query string, // search when we find a file that has been seen, or we // can choose to selectively update files. - k, err := kustomizationResultAdapter(rc, file) + k, err := kustomizationResultAdapter(gcl, file) if err != nil { errs = append(errs, err) + errorCnt++ continue } output <- k + totalCnt++ } + + logger.Printf("got %d files out of %d from API. %d of %d had errors\n", + totalCnt, page.Parsed.TotalCount, errorCnt, totalCnt) } return errs } -func kustomizationResultAdapter(rc RequestConfig, k GithubFileSpec) ( +func kustomizationResultAdapter(gcl GitHubClient, k GithubFileSpec) ( *doc.KustomizationDocument, error) { - data, err := GetFileData(rc, k) + data, err := gcl.GetFileData(k) if err != nil { return nil, err } - creationTime, err := GetFileCreationTime(rc, k) + creationTime, err := gcl.GetFileCreationTime(k) if err != nil { - logger.Printf("(Error: %v) initializing to current time.", err) + logger.Printf("(error: %v) initializing to current time.", err) } doc := doc.KustomizationDocument{ DocumentData: string(data), - FilePath: doc.Atom(k.Path), - RepositoryURL: doc.Atom(k.Repository.URL), + FilePath: k.Path, + RepositoryURL: k.Repository.URL, CreationTime: creationTime, } @@ -139,10 +162,12 @@ func kustomizationResultAdapter(rc RequestConfig, k GithubFileSpec) ( // ForwardPaginatedQuery follows the links to the next pages and performs all of // the queries for a given search query, relaying the data from each request // back to an output channel. -func ForwardPaginatedQuery(ctx context.Context, query string, retryCount uint64, +func (gcl GitHubClient) ForwardPaginatedQuery(ctx context.Context, query string, output chan<- GithubResponseInfo) error { - response := parseGithubResponse(query, retryCount) + logger.Println("querying: ", query) + response := gcl.parseGithubResponse(query) + if response.Error != nil { return response.Error } @@ -154,7 +179,7 @@ func ForwardPaginatedQuery(ctx context.Context, query string, retryCount uint64, case <-ctx.Done(): return nil default: - response = parseGithubResponse(response.NextURL, retryCount) + response = gcl.parseGithubResponse(response.NextURL) if response.Error != nil { return response.Error } @@ -167,20 +192,22 @@ func ForwardPaginatedQuery(ctx context.Context, query string, retryCount uint64, } // GetFileData gets the bytes from a file. -func GetFileData(rc RequestConfig, k GithubFileSpec) ([]byte, error) { +func (gcl GitHubClient) GetFileData(k GithubFileSpec) ([]byte, error) { - url := rc.ContentsRequest(k.Repository.FullName, k.Path) + url := gcl.ContentsRequest(k.Repository.FullName, k.Path) - logger.Println("content-url ", url) - resp, err := GetReposData(url, rc.RetryCount()) + resp, err := gcl.GetReposData(url) if err != nil { - return nil, err + return nil, fmt.Errorf("%+v: could not get '%s' metadata: %v", + k, url, err) } data, err := ioutil.ReadAll(resp.Body) if err != nil { - return nil, err + return nil, fmt.Errorf("%+v: could not read '%s' metadata: %v", + k, url, err) } + resp.Body.Close() type githubContentRawURL struct { DownloadURL string `json:"download_url,omitempty"` @@ -188,30 +215,33 @@ func GetFileData(rc RequestConfig, k GithubFileSpec) ([]byte, error) { var rawURL githubContentRawURL err = json.Unmarshal(data, &rawURL) if err != nil { - return nil, err + return nil, fmt.Errorf( + "%+v: could not get 'download_url' from '%s' response: %v", + k, data, err) } - logger.Println("raw-data-url", rawURL.DownloadURL) - resp, err = GetReposData(rawURL.DownloadURL, rc.RetryCount()) + resp, err = gcl.GetRawUserContent(rawURL.DownloadURL) if err != nil { - return nil, err + return nil, fmt.Errorf("%+v: could not fetch file raw data '%s': %v", + k, rawURL.DownloadURL, err) } + defer resp.Body.Close() return ioutil.ReadAll(resp.Body) } // GetFileCreationTime gets the earliest date of a file. -func GetFileCreationTime( - rc RequestConfig, k GithubFileSpec) (time.Time, error) { +func (gcl GitHubClient) GetFileCreationTime( + k GithubFileSpec) (time.Time, error) { - url := rc.CommitsRequest(k.Repository.FullName, k.Path) + url := gcl.CommitsRequest(k.Repository.FullName, k.Path) defaultTime := time.Now() - logger.Println("commits-url", url) - resp, err := GetReposData(url, rc.RetryCount()) + resp, err := gcl.GetReposData(url) if err != nil { - return defaultTime, err + return defaultTime, fmt.Errorf( + "%+v: '%s' could not get metadata: %v", k, url, err) } type DateSpec struct { @@ -224,18 +254,27 @@ func GetFileCreationTime( _, lastURL := parseGithubLinkFormat(resp.Header.Get("link")) if lastURL != "" { - resp, err = GetReposData(lastURL, rc.RetryCount()) + resp, err = gcl.GetReposData(lastURL) if err != nil { - return defaultTime, err + return defaultTime, fmt.Errorf( + "%+v: '%s' could not get metadata: %v", + k, lastURL, err) } } + defer resp.Body.Close() data, err := ioutil.ReadAll(resp.Body) + if err != nil { + return defaultTime, fmt.Errorf( + "%+v: failed to read metadata: %v", k, err) + } earliestDate := []DateSpec{} err = json.Unmarshal(data, &earliestDate) size := len(earliestDate) if err != nil || size == 0 { - return defaultTime, err + return defaultTime, fmt.Errorf( + "%+v: server response '%s' not in expected format: %v", + k, data, err) } return time.Parse(time.RFC3339, earliestDate[size-1].Commit.Author.Date) @@ -244,6 +283,9 @@ func GetFileCreationTime( // TODO(damienr74) change the tickers to actually check api rate limits, reset // times, and throttle requests dynamically based off of current utilization, // instead of hardcoding the documented values, these calls are not quota'd. +// This is now especially important, since caching the API requests will reduce +// API quota use (so we can actually make more requests in the allotted time +// period). // // See https://developer.github.com/v3/rate_limit/ for details. var ( @@ -334,8 +376,8 @@ func parseGithubLinkFormat(links string) (string, string) { return next, last } -func parseGithubResponse(getRequest string, retryCount uint64) GithubResponseInfo { - resp, err := SearchGithubAPI(getRequest, retryCount) +func (gcl GitHubClient) parseGithubResponse(getRequest string) GithubResponseInfo { + resp, err := gcl.SearchGithubAPI(getRequest) requestInfo := GithubResponseInfo{ Response: resp, Error: err, @@ -347,17 +389,18 @@ func parseGithubResponse(getRequest string, retryCount uint64) GithubResponseInf } var data []byte + defer resp.Body.Close() data, requestInfo.Error = ioutil.ReadAll(resp.Body) if requestInfo.Error != nil { return requestInfo } if resp.StatusCode != http.StatusOK { - logger.Println("Query: ", getRequest) - logger.Println("Status not OK at the source") - logger.Println("Header Dump", resp.Header) - logger.Println("Body Dump", string(data)) - requestInfo.Error = fmt.Errorf("Request Rejected, Status '%s'", + logger.Println("query: ", getRequest) + logger.Println("status not OK at the source") + logger.Println("header dump", resp.Header) + logger.Println("body dump", string(data)) + requestInfo.Error = fmt.Errorf("request rejected, status '%s'", resp.Status) return requestInfo } @@ -382,23 +425,30 @@ func parseGithubResponse(getRequest string, retryCount uint64) GithubResponseInf // SearchGithubAPI performs a search query and handles rate limitting for // the 'code/search?' endpoint as well as timed retries in the case of abuse // prevention. -func SearchGithubAPI(query string, retryCount uint64) (*http.Response, error) { +func (gcl GitHubClient) SearchGithubAPI(query string) (*http.Response, error) { throttleSearchAPI() - return getWithRetry(query, retryCount) + return gcl.getWithRetry(query) } // GetReposData performs a search query and handles rate limitting for // the '/repos' endpoint as well as timed retries in the case of abuse // prevention. -func GetReposData(query string, retryCount uint64) (*http.Response, error) { +func (gcl GitHubClient) GetReposData(query string) (*http.Response, error) { throttleRepoAPI() - return getWithRetry(query, retryCount) + return gcl.getWithRetry(query) } -func getWithRetry( - query string, retryCount uint64) (resp *http.Response, err error) { +// User content (file contents) is not API rate limited, so there's no use in +// throttling this call. +func (gcl GitHubClient) GetRawUserContent(query string) (*http.Response, error) { + return gcl.getWithRetry(query) +} - resp, err = http.Get(query) +func (gcl GitHubClient) getWithRetry( + query string) (resp *http.Response, err error) { + + resp, err = gcl.client.Get(query) + retryCount := gcl.retryCount for err == nil && resp.StatusCode == http.StatusForbidden && @@ -407,15 +457,21 @@ func getWithRetry( retryTime := resp.Header.Get("Retry-After") i, err := strconv.Atoi(retryTime) if err != nil { - return resp, fmt.Errorf("Forbidden without 'Retry-After'") + return resp, fmt.Errorf( + "query '%s' forbidden without 'Retry-After'", query) } logger.Printf( - "Status Forbidden, retring %d more times\n", retryCount) + "status forbidden, retring %d more times\n", retryCount) - logger.Printf("Waiting %d seconds before retrying\n", i) + logger.Printf("waiting %d seconds before retrying\n", i) time.Sleep(time.Second * time.Duration(i)) retryCount-- - resp, err = http.Get(query) + resp, err = gcl.client.Get(query) + } + + if err != nil { + return resp, fmt.Errorf("query '%s' could not be processed, %v", + query, err) } return resp, err diff --git a/internal/search/crawler/github/queries.go b/internal/search/crawler/github/queries.go index 7dc307914..f8ec23eea 100644 --- a/internal/search/crawler/github/queries.go +++ b/internal/search/crawler/github/queries.go @@ -99,16 +99,12 @@ func Path(p string) queryField { // determine the date of a file. type RequestConfig struct { perPage uint64 - retryCount uint64 accessToken string } -func NewRequestConfig( - perPage, retryCount uint64, accessToken string) RequestConfig { - +func NewRequestConfig(perPage uint64, accessToken string) RequestConfig { return RequestConfig{ perPage: perPage, - retryCount: retryCount, accessToken: accessToken, } } @@ -138,12 +134,6 @@ func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string { return rc.makeRequest(uri, Query{Path(path)}).URL() } -// How many times to retry the queries before giving up (used by the crawler, -// not Github). -func (rc RequestConfig) RetryCount() uint64 { - return rc.retryCount -} - func (rc RequestConfig) makeRequest(path string, query Query) request { vals := url.Values{} if rc.accessToken != "" { diff --git a/internal/search/crawler/github/split_search_ranges.go b/internal/search/crawler/github/split_search_ranges.go index c098fa6ac..58e445997 100644 --- a/internal/search/crawler/github/split_search_ranges.go +++ b/internal/search/crawler/github/split_search_ranges.go @@ -43,17 +43,17 @@ type cachedSearch interface { // over cached values. type githubCachedSearch struct { cache map[uint64]uint64 - retryCount uint64 + gcl GitHubClient baseRequest request } -func newCache(rc RequestConfig, query Query) githubCachedSearch { +func newCache(client GitHubClient, query Query) githubCachedSearch { return githubCachedSearch{ cache: map[uint64]uint64{ 0: 0, }, - retryCount: rc.RetryCount(), - baseRequest: rc.CodeSearchRequestWith(query), + gcl: client, + baseRequest: client.CodeSearchRequestWith(query), } } @@ -66,7 +66,7 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { sizeRange := RangeWithin{0, upperBound} rangeRequest := c.RequestString(sizeRange) - result := parseGithubResponse(rangeRequest, c.retryCount) + result := c.gcl.parseGithubResponse(rangeRequest) if result.Error != nil { return count, result.Error } @@ -100,7 +100,7 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { "Retrying query... current lower bound: %d, got: %d\n", c.cache[prev], result.Parsed.TotalCount) - result = parseGithubResponse(rangeRequest, c.retryCount) + result = c.gcl.parseGithubResponse(rangeRequest) if result.Error != nil { return count, result.Error } diff --git a/internal/search/go.mod b/internal/search/go.mod index e408ff68c..0d7da9db8 100644 --- a/internal/search/go.mod +++ b/internal/search/go.mod @@ -3,6 +3,8 @@ module sigs.k8s.io/kustomize/internal/search go 1.12 require ( + github.com/gomodule/redigo v2.0.0+incompatible + github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 gopkg.in/yaml.v2 v2.2.2 // indirect sigs.k8s.io/yaml v1.1.0 ) diff --git a/internal/search/go.sum b/internal/search/go.sum index 60aa01b56..1a7e918fd 100644 --- a/internal/search/go.sum +++ b/internal/search/go.sum @@ -1,3 +1,7 @@ +github.com/gomodule/redigo v2.0.0+incompatible h1:K/R+8tc58AaqLkqG2Ol3Qk+DR/TlNuhuh457pBFPtt0= +github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= diff --git a/internal/search/httpclient/httpclient.go b/internal/search/httpclient/httpclient.go new file mode 100644 index 000000000..417c587fb --- /dev/null +++ b/internal/search/httpclient/httpclient.go @@ -0,0 +1,19 @@ +package httpclient + +import ( + "net/http" + "time" + + "github.com/gomodule/redigo/redis" + "github.com/gregjones/httpcache" + redis_cache "github.com/gregjones/httpcache/redis" +) + +func NewClient(conn redis.Conn) *http.Client { + etagCache := redis_cache.NewWithClient(conn) + tr := httpcache.NewTransport(etagCache) + return &http.Client{ + Transport: tr, + Timeout: 10 * time.Second, + } +} From d3022ccd656a29c02dc682839fb9189644548fa5 Mon Sep 17 00:00:00 2001 From: Damien Robichaud Date: Fri, 16 Aug 2019 09:47:36 -0700 Subject: [PATCH 8/8] rename to tools directory --- internal/{search => tools}/crawler/crawler.go | 2 +- internal/{search => tools}/crawler/crawler_test.go | 2 +- internal/{search => tools}/crawler/github/crawler.go | 2 +- internal/{search => tools}/crawler/github/queries.go | 0 internal/{search => tools}/crawler/github/queries_test.go | 0 .../{search => tools}/crawler/github/split_search_ranges.go | 0 .../crawler/github/split_search_ranges_test.go | 0 internal/{search => tools}/doc/doc.go | 0 internal/{search => tools}/doc/doc_test.go | 0 internal/{search => tools}/go.mod | 2 +- internal/{search => tools}/go.sum | 0 internal/{search => tools}/httpclient/httpclient.go | 0 travis/pre-commit.sh | 1 - 13 files changed, 4 insertions(+), 5 deletions(-) rename internal/{search => tools}/crawler/crawler.go (98%) rename internal/{search => tools}/crawler/crawler_test.go (98%) rename internal/{search => tools}/crawler/github/crawler.go (99%) rename internal/{search => tools}/crawler/github/queries.go (100%) rename internal/{search => tools}/crawler/github/queries_test.go (100%) rename internal/{search => tools}/crawler/github/split_search_ranges.go (100%) rename internal/{search => tools}/crawler/github/split_search_ranges_test.go (100%) rename internal/{search => tools}/doc/doc.go (100%) rename internal/{search => tools}/doc/doc_test.go (100%) rename internal/{search => tools}/go.mod (81%) rename internal/{search => tools}/go.sum (100%) rename internal/{search => tools}/httpclient/httpclient.go (100%) diff --git a/internal/search/crawler/crawler.go b/internal/tools/crawler/crawler.go similarity index 98% rename from internal/search/crawler/crawler.go rename to internal/tools/crawler/crawler.go index 499da8dbe..98dccd626 100644 --- a/internal/search/crawler/crawler.go +++ b/internal/tools/crawler/crawler.go @@ -8,7 +8,7 @@ import ( "fmt" "sync" - "sigs.k8s.io/kustomize/internal/search/doc" + "sigs.k8s.io/kustomize/internal/tools/doc" ) // Crawler forwards documents from source repositories to index and store them diff --git a/internal/search/crawler/crawler_test.go b/internal/tools/crawler/crawler_test.go similarity index 98% rename from internal/search/crawler/crawler_test.go rename to internal/tools/crawler/crawler_test.go index b8a6af86d..094c86d35 100644 --- a/internal/search/crawler/crawler_test.go +++ b/internal/tools/crawler/crawler_test.go @@ -8,7 +8,7 @@ import ( "sync" "testing" - "sigs.k8s.io/kustomize/internal/search/doc" + "sigs.k8s.io/kustomize/internal/tools/doc" ) // Simple crawler that forwards it's list of documents to a provided channel and diff --git a/internal/search/crawler/github/crawler.go b/internal/tools/crawler/github/crawler.go similarity index 99% rename from internal/search/crawler/github/crawler.go rename to internal/tools/crawler/github/crawler.go index 1195ed07d..005570d99 100644 --- a/internal/search/crawler/github/crawler.go +++ b/internal/tools/crawler/github/crawler.go @@ -16,7 +16,7 @@ import ( "strings" "time" - "sigs.k8s.io/kustomize/internal/search/doc" + "sigs.k8s.io/kustomize/internal/tools/doc" ) var logger = log.New(os.Stdout, "Github Crawler: ", diff --git a/internal/search/crawler/github/queries.go b/internal/tools/crawler/github/queries.go similarity index 100% rename from internal/search/crawler/github/queries.go rename to internal/tools/crawler/github/queries.go diff --git a/internal/search/crawler/github/queries_test.go b/internal/tools/crawler/github/queries_test.go similarity index 100% rename from internal/search/crawler/github/queries_test.go rename to internal/tools/crawler/github/queries_test.go diff --git a/internal/search/crawler/github/split_search_ranges.go b/internal/tools/crawler/github/split_search_ranges.go similarity index 100% rename from internal/search/crawler/github/split_search_ranges.go rename to internal/tools/crawler/github/split_search_ranges.go diff --git a/internal/search/crawler/github/split_search_ranges_test.go b/internal/tools/crawler/github/split_search_ranges_test.go similarity index 100% rename from internal/search/crawler/github/split_search_ranges_test.go rename to internal/tools/crawler/github/split_search_ranges_test.go diff --git a/internal/search/doc/doc.go b/internal/tools/doc/doc.go similarity index 100% rename from internal/search/doc/doc.go rename to internal/tools/doc/doc.go diff --git a/internal/search/doc/doc_test.go b/internal/tools/doc/doc_test.go similarity index 100% rename from internal/search/doc/doc_test.go rename to internal/tools/doc/doc_test.go diff --git a/internal/search/go.mod b/internal/tools/go.mod similarity index 81% rename from internal/search/go.mod rename to internal/tools/go.mod index 0d7da9db8..68ae4f06b 100644 --- a/internal/search/go.mod +++ b/internal/tools/go.mod @@ -1,4 +1,4 @@ -module sigs.k8s.io/kustomize/internal/search +module sigs.k8s.io/kustomize/internal/tools go 1.12 diff --git a/internal/search/go.sum b/internal/tools/go.sum similarity index 100% rename from internal/search/go.sum rename to internal/tools/go.sum diff --git a/internal/search/httpclient/httpclient.go b/internal/tools/httpclient/httpclient.go similarity index 100% rename from internal/search/httpclient/httpclient.go rename to internal/tools/httpclient/httpclient.go diff --git a/travis/pre-commit.sh b/travis/pre-commit.sh index 8d55f2b2b..ba34d5723 100755 --- a/travis/pre-commit.sh +++ b/travis/pre-commit.sh @@ -30,7 +30,6 @@ function testGoLangCILint { function testGoTest { go test -v ./... - (cd ./internal/search; go test -v ./...) } # These tests require the helm program, and at the moment