diff --git a/internal/tools/crawler/crawler.go b/internal/tools/crawler/crawler.go new file mode 100644 index 000000000..98dccd626 --- /dev/null +++ b/internal/tools/crawler/crawler.go @@ -0,0 +1,76 @@ +// Package crawler provides helper methods and defines an interface for lauching +// source repository crawlers that retrieve files from a source and forwards +// to a channel for indexing and retrieval. +package crawler + +import ( + "context" + "fmt" + "sync" + + "sigs.k8s.io/kustomize/internal/tools/doc" +) + +// Crawler forwards documents from source repositories to index and store them +// for searching. Each crawler is responsible for querying it's source of +// information, and forwarding files that have not been seen before or that need +// updating. +type Crawler interface { + // Crawl returns when it is done processing. This method does not take + // ownership of the channel. The channel is write only, and it + // designates where the crawler should forward the documents. + Crawl(ctx context.Context, output chan<- *doc.KustomizationDocument) error +} + +// CrawlerRunner is a blocking function and only returns once all of the +// crawlers are finished with execution. +// +// This function uses the output channel to forward kustomization documents +// from a list of crawlers. The output is to be consumed by a database/search +// indexer for later retrieval. +// +// The return value is an array of errors in which each index represents the +// index of the crawler that emitted the error. Although the errors themselves +// can be nil, the array will always be exactly the size of the crawlers array. +func CrawlerRunner(ctx context.Context, + output chan<- *doc.KustomizationDocument, crawlers []Crawler) []error { + + errs := make([]error, len(crawlers)) + wg := sync.WaitGroup{} + + for i, crawler := range crawlers { + // Crawler implementations get their own channels to prevent a + // crawler from closing the main output channel. + docs := make(chan *doc.KustomizationDocument) + wg.Add(2) + + // Forward all of the documents from this crawler's channel to + // the main output channel. + go func(docs <-chan *doc.KustomizationDocument) { + defer wg.Done() + for doc := range docs { + output <- doc + } + }(docs) + + // Run this crawler and capture its returned error. + go func(idx int, crawler Crawler, + docs chan<- *doc.KustomizationDocument) { + + defer func() { + wg.Done() + if r := recover(); r != nil { + errs[idx] = fmt.Errorf( + "%+v panicked: %v, additional error %v", + crawler, r, errs[idx], + ) + } + }() + defer close(docs) + errs[idx] = crawler.Crawl(ctx, docs) + }(i, crawler, docs) // Copies the index and the crawler + } + + wg.Wait() + return errs +} diff --git a/internal/tools/crawler/crawler_test.go b/internal/tools/crawler/crawler_test.go new file mode 100644 index 000000000..094c86d35 --- /dev/null +++ b/internal/tools/crawler/crawler_test.go @@ -0,0 +1,124 @@ +package crawler + +import ( + "context" + "errors" + "reflect" + "sort" + "sync" + "testing" + + "sigs.k8s.io/kustomize/internal/tools/doc" +) + +// Simple crawler that forwards it's list of documents to a provided channel and +// returns it's error to the caller. +type testCrawler struct { + docs []doc.KustomizationDocument + err error +} + +// Crawl implements the Crawler interface for testing. +func (c testCrawler) Crawl(ctx context.Context, + output chan<- *doc.KustomizationDocument) error { + + for i := range c.docs { + output <- &c.docs[i] + } + return c.err +} + +// Used to make sure that we're comparing documents in order. This is needed +// since these documents will be sent concurrently. +type sortableDocs []doc.KustomizationDocument + +func (s sortableDocs) Less(i, j int) bool { + return s[i].FilePath < s[j].FilePath +} + +func (s sortableDocs) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +func (s sortableDocs) Len() int { + return len(s) +} + +func TestCrawlerRunner(t *testing.T) { + tests := []struct { + tc []Crawler + errs []error + docs sortableDocs + }{ + { + tc: []Crawler{ + testCrawler{ + docs: []doc.KustomizationDocument{ + {FilePath: "crawler1/doc1"}, + {FilePath: "crawler1/doc2"}, + {FilePath: "crawler1/doc3"}, + }, + }, + testCrawler{err: errors.New("crawler2")}, + testCrawler{}, + testCrawler{ + docs: []doc.KustomizationDocument{ + {FilePath: "crawler4/doc1"}, + {FilePath: "crawler4/doc2"}, + }, + err: errors.New("crawler4"), + }, + }, + errs: []error{ + nil, + errors.New("crawler2"), + nil, + errors.New("crawler4"), + }, + docs: sortableDocs{ + {FilePath: "crawler1/doc1"}, + {FilePath: "crawler1/doc2"}, + {FilePath: "crawler1/doc3"}, + {FilePath: "crawler4/doc1"}, + {FilePath: "crawler4/doc2"}, + }, + }, + } + + for _, test := range tests { + output := make(chan *doc.KustomizationDocument) + wg := sync.WaitGroup{} + wg.Add(1) + + // Run the Crawler runner with a list of crawlers. + go func() { + defer close(output) + defer wg.Done() + + errs := CrawlerRunner(context.Background(), output, + test.tc) + + // Check that errors are returned as they should be. + if !reflect.DeepEqual(errs, test.errs) { + t.Errorf("Expected errs (%v) to equal (%v)", + errs, test.errs) + } + + }() + + // Iterate over the output channel of Crawler runner. + returned := make(sortableDocs, 0, len(test.docs)) + for doc := range output { + returned = append(returned, *doc) + } + + // Check that all documents are received. + sort.Sort(returned) + if !reflect.DeepEqual(returned, test.docs) { + t.Errorf("Expected docs (%v) to equal (%v)\n", + returned, test.docs) + } + + wg.Wait() + } +} diff --git a/internal/tools/crawler/github/crawler.go b/internal/tools/crawler/github/crawler.go new file mode 100644 index 000000000..005570d99 --- /dev/null +++ b/internal/tools/crawler/github/crawler.go @@ -0,0 +1,478 @@ +// Package github implements the crawler.Crawler interface, getting data +// from the Github search API. +package github + +import ( + "context" + "encoding/json" + "fmt" + "io/ioutil" + "log" + "math" + "net/http" + "os" + "regexp" + "strconv" + "strings" + "time" + + "sigs.k8s.io/kustomize/internal/tools/doc" +) + +var logger = log.New(os.Stdout, "Github Crawler: ", + log.LstdFlags|log.LUTC|log.Llongfile) + +// Implements crawler.Crawler. +type githubCrawler struct { + client GitHubClient + query Query +} + +type GitHubClient struct { + RequestConfig + retryCount uint64 + client *http.Client +} + +func NewCrawler(accessToken string, retryCount uint64, client *http.Client, + query Query) githubCrawler { + + return githubCrawler{ + client: GitHubClient{ + retryCount: retryCount, + client: client, + RequestConfig: RequestConfig{ + perPage: githubMaxPageSize, + accessToken: accessToken, + }, + }, + query: query, + } +} + +// Implements crawler.Crawler. +func (gc githubCrawler) Crawl( + ctx context.Context, output chan<- *doc.KustomizationDocument) error { + + noETagClient := GitHubClient{ + RequestConfig: gc.client.RequestConfig, + client: &http.Client{Timeout: gc.client.client.Timeout}, + retryCount: gc.client.retryCount, + } + + // Since Github returns a max of 1000 results per query, we can use + // multiple queries that split the search space into chunks of at most + // 1000 files to get all of the data. + ranges, err := FindRangesForRepoSearch(newCache(noETagClient, gc.query)) + if err != nil { + return fmt.Errorf("could not split %v into ranges, %v\n", + gc.query, err) + } + + logger.Println("ranges: ", ranges) + + // Query each range for files. + errs := make(multiError, 0) + for _, query := range ranges { + err := processQuery(ctx, gc.client, query, output) + if err != nil { + errs = append(errs, err) + } + } + + return errs +} + +// processQuery follows all of the pages in a query, and updates/adds the +// documents from the crawl to the datastore/index. +func processQuery(ctx context.Context, gcl GitHubClient, query string, + output chan<- *doc.KustomizationDocument) error { + + queryPages := make(chan GithubResponseInfo) + + go func() { + // Forward the document metadata to the retrieval channel. + // This separation allows for concurrent requests for the code + // search, and the retrieval portions of the API. + err := gcl.ForwardPaginatedQuery(ctx, query, queryPages) + if err != nil { + // TODO(damienr74) handle this error with redis? + logger.Println(err) + } + close(queryPages) + }() + + errs := make(multiError, 0) + errorCnt := 0 + totalCnt := 0 + for page := range queryPages { + if page.Error != nil { + errs = append(errs, page.Error) + continue + } + + for _, file := range page.Parsed.Items { + // TODO(damienr74) This is where we'd need to + // communicate with redis. Currently always doing a full + // reindex of the documents. Since the documents are in + // sorted order in each bucket, we can short circuit the + // search when we find a file that has been seen, or we + // can choose to selectively update files. + + k, err := kustomizationResultAdapter(gcl, file) + if err != nil { + errs = append(errs, err) + errorCnt++ + continue + } + output <- k + totalCnt++ + } + + logger.Printf("got %d files out of %d from API. %d of %d had errors\n", + totalCnt, page.Parsed.TotalCount, errorCnt, totalCnt) + } + + return errs +} + +func kustomizationResultAdapter(gcl GitHubClient, k GithubFileSpec) ( + *doc.KustomizationDocument, error) { + + data, err := gcl.GetFileData(k) + if err != nil { + return nil, err + } + + creationTime, err := gcl.GetFileCreationTime(k) + if err != nil { + logger.Printf("(error: %v) initializing to current time.", err) + } + + doc := doc.KustomizationDocument{ + DocumentData: string(data), + FilePath: k.Path, + RepositoryURL: k.Repository.URL, + CreationTime: creationTime, + } + + return &doc, nil +} + +// ForwardPaginatedQuery follows the links to the next pages and performs all of +// the queries for a given search query, relaying the data from each request +// back to an output channel. +func (gcl GitHubClient) ForwardPaginatedQuery(ctx context.Context, query string, + output chan<- GithubResponseInfo) error { + + logger.Println("querying: ", query) + response := gcl.parseGithubResponse(query) + + if response.Error != nil { + return response.Error + } + + output <- response + + for response.LastURL != "" && response.NextURL != "" { + select { + case <-ctx.Done(): + return nil + default: + response = gcl.parseGithubResponse(response.NextURL) + if response.Error != nil { + return response.Error + } + + output <- response + } + } + + return nil +} + +// GetFileData gets the bytes from a file. +func (gcl GitHubClient) GetFileData(k GithubFileSpec) ([]byte, error) { + + url := gcl.ContentsRequest(k.Repository.FullName, k.Path) + + resp, err := gcl.GetReposData(url) + if err != nil { + return nil, fmt.Errorf("%+v: could not get '%s' metadata: %v", + k, url, err) + } + + data, err := ioutil.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("%+v: could not read '%s' metadata: %v", + k, url, err) + } + resp.Body.Close() + + type githubContentRawURL struct { + DownloadURL string `json:"download_url,omitempty"` + } + var rawURL githubContentRawURL + err = json.Unmarshal(data, &rawURL) + if err != nil { + return nil, fmt.Errorf( + "%+v: could not get 'download_url' from '%s' response: %v", + k, data, err) + } + + resp, err = gcl.GetRawUserContent(rawURL.DownloadURL) + if err != nil { + return nil, fmt.Errorf("%+v: could not fetch file raw data '%s': %v", + k, rawURL.DownloadURL, err) + } + + defer resp.Body.Close() + return ioutil.ReadAll(resp.Body) +} + +// GetFileCreationTime gets the earliest date of a file. +func (gcl GitHubClient) GetFileCreationTime( + k GithubFileSpec) (time.Time, error) { + + url := gcl.CommitsRequest(k.Repository.FullName, k.Path) + + defaultTime := time.Now() + + resp, err := gcl.GetReposData(url) + if err != nil { + return defaultTime, fmt.Errorf( + "%+v: '%s' could not get metadata: %v", k, url, err) + } + + type DateSpec struct { + Commit struct { + Author struct { + Date string `json:"date,omitempty"` + } `json:"author,omitempty"` + } `json:"commit,omitempty"` + } + + _, lastURL := parseGithubLinkFormat(resp.Header.Get("link")) + if lastURL != "" { + resp, err = gcl.GetReposData(lastURL) + if err != nil { + return defaultTime, fmt.Errorf( + "%+v: '%s' could not get metadata: %v", + k, lastURL, err) + } + } + + defer resp.Body.Close() + data, err := ioutil.ReadAll(resp.Body) + if err != nil { + return defaultTime, fmt.Errorf( + "%+v: failed to read metadata: %v", k, err) + } + earliestDate := []DateSpec{} + err = json.Unmarshal(data, &earliestDate) + size := len(earliestDate) + if err != nil || size == 0 { + return defaultTime, fmt.Errorf( + "%+v: server response '%s' not in expected format: %v", + k, data, err) + } + + return time.Parse(time.RFC3339, earliestDate[size-1].Commit.Author.Date) +} + +// TODO(damienr74) change the tickers to actually check api rate limits, reset +// times, and throttle requests dynamically based off of current utilization, +// instead of hardcoding the documented values, these calls are not quota'd. +// This is now especially important, since caching the API requests will reduce +// API quota use (so we can actually make more requests in the allotted time +// period). +// +// See https://developer.github.com/v3/rate_limit/ for details. +var ( + searchRateTicker = time.NewTicker(time.Second * 2) + contentRateTicker = time.NewTicker(time.Second * 1) +) + +func throttleSearchAPI() { + <-searchRateTicker.C +} + +func throttleRepoAPI() { + <-contentRateTicker.C +} + +const ( + accessTokenKeyword = "access_token=" + perPageKeyword = "per_page=" + contentSearchURL = "https://api.github.com/repos" + contentKeyword = "contents" +) + +type multiError []error + +func (me multiError) Error() string { + size := len(me) + 2 + strs := make([]string, size) + strs[0] = "Errors [\n\t" + for i, err := range me { + strs[i+1] = err.Error() + } + strs[size-1] = "\n]" + return strings.Join(strs, "\n\t") +} + +type GithubFileSpec struct { + Path string `json:"path,omitempty"` + Repository struct { + URL string `json:"html_url,omitempty"` + FullName string `json:"full_name,omitempty"` + } `json:"repository,omitempty"` +} + +type githubResponse struct { + // MaxUint is reserved as a sentinel value. + // This is the number of files that match the query. + TotalCount uint64 `json:"total_count,omitempty"` + + // Github representation of a file. + Items []GithubFileSpec `json:"items,omitempty"` +} + +type GithubResponseInfo struct { + *http.Response + Parsed *githubResponse + Error error + NextURL string + LastURL string +} + +func parseGithubLinkFormat(links string) (string, string) { + const ( + linkNext = "next" + linkLast = "last" + linkInfoURL = 1 + linkInfoRel = 2 + ) + + next, last := "", "" + linkInfo := regexp.MustCompile(`<(.*)>.*; rel="(last|next)"`) + + for _, link := range strings.Split(links, ",") { + linkParse := linkInfo.FindStringSubmatch(link) + if len(linkParse) != 3 { + continue + } + + url := linkParse[linkInfoURL] + switch linkParse[linkInfoRel] { + case linkNext: + next = url + case linkLast: + last = url + default: + } + } + + return next, last +} + +func (gcl GitHubClient) parseGithubResponse(getRequest string) GithubResponseInfo { + resp, err := gcl.SearchGithubAPI(getRequest) + requestInfo := GithubResponseInfo{ + Response: resp, + Error: err, + Parsed: nil, + } + + if err != nil || resp == nil { + return requestInfo + } + + var data []byte + defer resp.Body.Close() + data, requestInfo.Error = ioutil.ReadAll(resp.Body) + if requestInfo.Error != nil { + return requestInfo + } + + if resp.StatusCode != http.StatusOK { + logger.Println("query: ", getRequest) + logger.Println("status not OK at the source") + logger.Println("header dump", resp.Header) + logger.Println("body dump", string(data)) + requestInfo.Error = fmt.Errorf("request rejected, status '%s'", + resp.Status) + return requestInfo + } + + requestInfo.NextURL, requestInfo.LastURL = + parseGithubLinkFormat(resp.Header.Get("link")) + + resultCount := githubResponse{ + TotalCount: math.MaxUint64, + } + requestInfo.Error = json.Unmarshal(data, &resultCount) + if requestInfo.Error != nil { + return requestInfo + } + + requestInfo.Parsed = &resultCount + + return requestInfo + +} + +// SearchGithubAPI performs a search query and handles rate limitting for +// the 'code/search?' endpoint as well as timed retries in the case of abuse +// prevention. +func (gcl GitHubClient) SearchGithubAPI(query string) (*http.Response, error) { + throttleSearchAPI() + return gcl.getWithRetry(query) +} + +// GetReposData performs a search query and handles rate limitting for +// the '/repos' endpoint as well as timed retries in the case of abuse +// prevention. +func (gcl GitHubClient) GetReposData(query string) (*http.Response, error) { + throttleRepoAPI() + return gcl.getWithRetry(query) +} + +// User content (file contents) is not API rate limited, so there's no use in +// throttling this call. +func (gcl GitHubClient) GetRawUserContent(query string) (*http.Response, error) { + return gcl.getWithRetry(query) +} + +func (gcl GitHubClient) getWithRetry( + query string) (resp *http.Response, err error) { + + resp, err = gcl.client.Get(query) + retryCount := gcl.retryCount + + for err == nil && + resp.StatusCode == http.StatusForbidden && + retryCount > 0 { + + retryTime := resp.Header.Get("Retry-After") + i, err := strconv.Atoi(retryTime) + if err != nil { + return resp, fmt.Errorf( + "query '%s' forbidden without 'Retry-After'", query) + } + logger.Printf( + "status forbidden, retring %d more times\n", retryCount) + + logger.Printf("waiting %d seconds before retrying\n", i) + time.Sleep(time.Second * time.Duration(i)) + retryCount-- + resp, err = gcl.client.Get(query) + } + + if err != nil { + return resp, fmt.Errorf("query '%s' could not be processed, %v", + query, err) + } + + return resp, err +} diff --git a/internal/tools/crawler/github/queries.go b/internal/tools/crawler/github/queries.go new file mode 100644 index 000000000..f8ec23eea --- /dev/null +++ b/internal/tools/crawler/github/queries.go @@ -0,0 +1,219 @@ +package github + +import ( + "fmt" + "net/url" + "strings" +) + +const ( + perPageArg = "per_page" + accessTokenArg = "access_token" + + githubMaxPageSize = 100 +) + +// Implementation detail, not important to external API. +type queryField struct { + name string + value interface{} +} + +// Formats a query field. +func (qf queryField) String() string { + var value string + switch v := qf.value.(type) { + case string: + value = v + case rangeFormatter: + value = v.RangeString() + default: + value = fmt.Sprint(v) + } + + if qf.name == "" { + return value + } + return fmt.Sprint(qf.name, ":", value) +} + +// Example of formating a query: +// QueryWith( +// Filename("kustomization.yaml"), +// Filesize(RangeWithin{64, 192}), +// Keyword("copyright"), +// Keyword("2019"), +// ).String() +// +// Outputs "q=filename:kustomization.yaml+size:64..192+copyright+2018" which +// would search for files that have [64, 192] bytes (inclusive range) and that +// contain the keywords 'copyright' and '2019' somewhere in the file. +type Query []queryField + +func QueryWith(qfs ...queryField) Query { + return Query(qfs) +} + +func (q Query) String() string { + strs := make([]string, 0, len(q)) + for _, elem := range q { + str := elem.String() + if str == "" { + continue + } + strs = append(strs, str) + } + + query := strings.Join(strs, "+") + if query == "" { + return query + } + return "q=" + query +} + +// Keyword takes a single word, and formats it according to the Github API. +func Keyword(k string) queryField { + return queryField{value: k} +} + +// Filesize takes a rangeFormatter and formats it according to the Github API. +func Filesize(r rangeFormatter) queryField { + return queryField{name: "size", value: r} +} + +// Filename takes a filename and formats it according to the Github API. +func Filename(f string) queryField { + return queryField{name: "filename", value: f} +} + +// Path takes a filepath and formats it according to the Github API. +func Path(p string) queryField { + return queryField{name: "path", value: p} +} + +// RequestConfig stores common variables that must be present for the queries. +// - CodeSearchRequests: ask Github to check the code indices given a query. +// - ContentsRequests: ask Github where to download a resource given a repo and a +// file path. +// - CommitsRequests: asks Github to list commits made one a file. Useful to +// determine the date of a file. +type RequestConfig struct { + perPage uint64 + accessToken string +} + +func NewRequestConfig(perPage uint64, accessToken string) RequestConfig { + return RequestConfig{ + perPage: perPage, + accessToken: accessToken, + } +} + +// CodeSearchRequestWith given a list of query parameters that specify the +// (patial) query, returns a request object with the (parital) query. Must call +// the URL method to get the string value of the URL. See request.CopyWith, to +// understand why the request object is useful. +func (rc RequestConfig) CodeSearchRequestWith(query Query) request { + req := rc.makeRequest("search/code", query) + req.vals.Set("sort", "indexed") + req.vals.Set("order", "desc") + return req +} + +// ContentsRequest given the repo name, and the filepath returns a formatted +// query for the Github API to find the dowload information of this filepath. +func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string { + uri := fmt.Sprintf("repos/%s/contents/%s", fullRepoName, path) + return rc.makeRequest(uri, Query{}).URL() +} + +// CommitsRequest given the repo name, and a filepath returns a formatted query +// for the Github API to find the commits that affect this file. +func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string { + uri := fmt.Sprintf("repos/%s/commits", fullRepoName) + return rc.makeRequest(uri, Query{Path(path)}).URL() +} + +func (rc RequestConfig) makeRequest(path string, query Query) request { + vals := url.Values{} + if rc.accessToken != "" { + vals.Set(accessTokenArg, rc.accessToken) + } + vals.Set(perPageArg, fmt.Sprint(rc.perPage)) + + return request{ + url: url.URL{ + Scheme: "https", + Host: "api.github.com", + Path: path, + }, + vals: vals, + query: query, + } +} + +type request struct { + url url.URL + vals url.Values + query Query +} + +// CopyWith copies the requests and adds the extra query parameters. Usefull +// for dynamically adding sizes to a filename only query without modifying it. +func (r request) CopyWith(queryParams ...queryField) request { + cpy := r + cpy.query = append(cpy.query, queryParams...) + return cpy +} + +// URL encodes the variables and the URL representation into a string. +func (r request) URL() string { + // Github does not handle URL encoding properly in its API for the + // q='...', so the query parameter is added without any encoding + // manually. + encoded := r.vals.Encode() + query := r.query.String() + sep := "&" + if query == "" { + sep = "" + } + if encoded == "" && query != "" { + sep = "?" + } + r.url.RawQuery = encoded + sep + query + return r.url.String() +} + +// Allows to define a range of numbers and print it in the github range +// query format https://help.github.com/en/articles/understanding-the-search-syntax. +type rangeFormatter interface { + RangeString() string +} + +// RangeLessThan is a range of values strictly less than (<) size. +type RangeLessThan struct { + size uint64 +} + +func (r RangeLessThan) RangeString() string { + return fmt.Sprintf("<%d", r.size) +} + +// RangeLessThan is a range of values strictly greater than (>) size. +type RangeGreaterThan struct { + size uint64 +} + +func (r RangeGreaterThan) RangeString() string { + return fmt.Sprintf(">%d", r.size) +} + +// RangeWithin is an inclusive range from start to end. +type RangeWithin struct { + start uint64 + end uint64 +} + +func (r RangeWithin) RangeString() string { + return fmt.Sprintf("%d..%d", r.start, r.end) +} diff --git a/internal/tools/crawler/github/queries_test.go b/internal/tools/crawler/github/queries_test.go new file mode 100644 index 000000000..98ef7d564 --- /dev/null +++ b/internal/tools/crawler/github/queries_test.go @@ -0,0 +1,119 @@ +package github + +import ( + "testing" +) + +func TestQueryFields(t *testing.T) { + testCases := []struct { + formatter queryField + expected string + }{ + { + formatter: Keyword("keyword"), + expected: "keyword", + }, + { + formatter: Filesize(RangeLessThan{23}), + expected: "size:<23", + }, + { + formatter: Filesize(RangeWithin{24, 64}), + expected: "size:24..64", + }, + { + formatter: Filesize(RangeGreaterThan{64}), + expected: "size:>64", + }, + { + formatter: Path("some/path/to/file"), + expected: "path:some/path/to/file", + }, + { + formatter: Filename("kustomization.yaml"), + expected: "filename:kustomization.yaml", + }, + } + + for _, test := range testCases { + if result := test.formatter.String(); result != test.expected { + t.Errorf("got (%#v = %s), expected %s", test.formatter, result, test.expected) + } + } +} + +func TestQueryType(t *testing.T) { + testCases := []struct { + query Query + expected string + }{ + { + query: QueryWith( + Filesize(RangeWithin{24, 64}), + Filename("kustomization.yaml"), + Keyword("keyword1"), + Keyword("keyword2"), + ), + expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2", + }, + } + + for _, test := range testCases { + if queryStr := test.query.String(); queryStr != test.expected { + t.Errorf("got (%#v = %s), expected %s", test.query, queryStr, test.expected) + } + + } +} + +func TestGithubSearchQuery(t *testing.T) { + const ( + accessToken = "random_token" + perPage = 100 + ) + + testCases := []struct { + rc RequestConfig + codeQuery Query + fullRepoName string + path string + expectedCodeQuery string + expectedContentsQuery string + expectedCommitsQuery string + }{ + { + rc: RequestConfig{ + perPage: perPage, + accessToken: accessToken, + }, + codeQuery: Query{ + Filename("kustomization.yaml"), + Filesize(RangeWithin{64, 128}), + }, + fullRepoName: "kubernetes-sigs/kustomize", + path: "examples/helloWorld/kustomization.yaml", + + expectedCodeQuery: "https://api.github.com/search/code?" + + "access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128", + + expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" + + "examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100", + + expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" + + "access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml", + }, + } + + for _, test := range testCases { + if result := test.rc.CodeSearchRequestWith(test.codeQuery).URL(); result != test.expectedCodeQuery { + t.Errorf("Got code query: %s, expected %s", result, test.expectedCodeQuery) + } + + if result := test.rc.ContentsRequest(test.fullRepoName, test.path); result != test.expectedContentsQuery { + t.Errorf("Got contents query: %s, expected %s", result, test.expectedContentsQuery) + } + if result := test.rc.CommitsRequest(test.fullRepoName, test.path); result != test.expectedCommitsQuery { + t.Errorf("Got commits query: %s, expected %s", result, test.expectedCommitsQuery) + } + } +} diff --git a/internal/tools/crawler/github/split_search_ranges.go b/internal/tools/crawler/github/split_search_ranges.go new file mode 100644 index 000000000..58e445997 --- /dev/null +++ b/internal/tools/crawler/github/split_search_ranges.go @@ -0,0 +1,274 @@ +package github + +import ( + "fmt" + "math/bits" +) + +// Files cannot be more than 2^19 bytes, according to +// https://help.github.com/en/articles/searching-code#considerations-for-code-search +const ( + githubMaxFileSize = uint64(1 << 19) + githubMaxResultsPerQuery = uint64(1000) +) + +// Interface for testing purposes. Not expecting to have multiple +// implementations. +type cachedSearch interface { + CountResults(uint64) (uint64, error) + RequestString(filesize rangeFormatter) string +} + +// Cache uses bit tricks to be more efficient in detecting +// inconsistencies in the returned data from the Github API. +// Therefore, the cache expects a search to always start at 0, and +// it expects the max file size to be a power of 2. If this is to be changed +// there are a few considerations to keep in mind: +// +// 1. The cache is only efficient if the queries can be reused, so if +// the first chunk of files lives in the range 0..x, continuing the +// search for the next chunk from x+1..max (while asymptotically sane) +// may actually be less efficient since the cache is essentially reset +// at every interval. This leads to a larger number of requests in +// practice, and requests are what's expensive (rate limits). +// +// 2. The github API is not perfectly monotonic.. (this is somewhat +// problematic). The current cache implementation looks at the +// predecessor entry to find out if the current value is monotonic. +// This is where the bit trick is used, since each step in the binary +// search is adding or ommiting to add a decreasing of 2 to the query value, +// we can remove the least significant set bit to find the predecessor in +// constant time. Ultimately since the search is rate limited, we could also +// easily afford to compute this in linear time by iterating +// over cached values. +type githubCachedSearch struct { + cache map[uint64]uint64 + gcl GitHubClient + baseRequest request +} + +func newCache(client GitHubClient, query Query) githubCachedSearch { + return githubCachedSearch{ + cache: map[uint64]uint64{ + 0: 0, + }, + gcl: client, + baseRequest: client.CodeSearchRequestWith(query), + } +} + +func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { + count, cached := c.cache[upperBound] + if cached { + return count, nil + } + + sizeRange := RangeWithin{0, upperBound} + rangeRequest := c.RequestString(sizeRange) + + result := c.gcl.parseGithubResponse(rangeRequest) + if result.Error != nil { + return count, result.Error + } + + // As range search uses powers of 2 for binary search, the previously + // cached value is easy to find by removing the least significant set + // bit from the current upperBound, since each step of the search adds + // least significant set bit. + // + // Finding the predecessor could also be implemented by iterating over + // the map to find the largest key that is smaller than upperBound if + // this approach deemed too complex. + trail := bits.TrailingZeros64(upperBound) + prev := uint64(0) + if trail != 64 { + prev = upperBound - (1 << uint64(trail)) + } + + // Sometimes the github API is not monotonically increasing, or ouputs + // an erroneous value of 0, or 1. This logic makes sure that it was not + // erroneous, and that the sequence continues to be monotonic by setting + // the current query count to match the previous value. which at least + // guarantees that the range search terminates. + // + // On the other hand, if files are added, then we way loose out on some + // files in a reviously completed range, but these files should be there + // the next time the crawler runs, so this is not really problematic. + retryMonotonicCount := 4 + for result.Parsed.TotalCount < c.cache[prev] { + logger.Printf( + "Retrying query... current lower bound: %d, got: %d\n", + c.cache[prev], result.Parsed.TotalCount) + + result = c.gcl.parseGithubResponse(rangeRequest) + if result.Error != nil { + return count, result.Error + } + + retryMonotonicCount-- + if retryMonotonicCount <= 0 { + result.Parsed.TotalCount = c.cache[prev] + logger.Println( + "Retries for monotonic check exceeded,", + " setting value to match predecessor") + } + } + + count = result.Parsed.TotalCount + logger.Printf("Caching new query %s, with count %d\n", + sizeRange.RangeString(), count) + c.cache[upperBound] = count + return count, nil +} + +func (c githubCachedSearch) RequestString(filesize rangeFormatter) string { + return c.baseRequest.CopyWith(Filesize(filesize)).URL() +} + +// Outputs a (possibly incomplete) list of ranges to query to find most search +// results as permissible by the search github search API. Github search only +// allows 1,000 results per query (paginated). +// Source: https://developer.github.com/v3/search/ +// +// This leaves the possibility of having file sizes with more than 1000 results, +// This would mean that the search as it is could not find all files. If queries +// are sorted by last indexed, and retrieved on regular intervals, it should be +// sufficient to get most if not all documents. +func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { + totalFiles, err := cache.CountResults(githubMaxFileSize) + if err != nil { + return nil, err + } + logger.Println("total files: ", totalFiles) + + if githubMaxResultsPerQuery >= totalFiles { + return []string{ + cache.RequestString(RangeWithin{0, githubMaxFileSize}), + }, nil + } + + // Find all the ranges of file sizes such that all files are queryable + // using the Github API. This does not compute an optimal ranges, since + // the number of queries needed to get the information required to + // compute an optimal range is expected to be much larger than the + // number of queries performed this way. + // + // The number of ranges is k = (number of files)/1000, and finding a + // range is logarithmic in the max file size (n = filesize). This means + // that preprocessing takes O(k * lg n) queries to find the ranges with + // a binary search over file sizes. + // + // My intuition is that this approach is competitive to a perfectly + // optimal solution, but I didn't actually take the time to do a + // rigurous proof. Intuitively, since files sizes are typically power + // law distibuted the binary search will be very skewed towards the + // smaller file ranges. This means that in practice this approach will + // make fewer than (#files/1000)*(log(n) = 19) queries for + // preprocessing, since it reuses a lot of the queries in the denser + // ranges. Furthermore, because of the distribution, it should be very + // easy to find ranges that are very close to the upper bound, up to + // the limiting factor of having no more than 1000 files accessible per + // range. + filesAccessible := uint64(0) + sizes := make([]uint64, 0) + for filesAccessible < totalFiles { + target := filesAccessible + githubMaxResultsPerQuery + if target >= totalFiles { + break + } + + logger.Printf("%d accessible files, next target = %d\n", + filesAccessible, target) + + cur, err := lowerBoundFileCount(cache, target) + if err != nil { + return nil, err + } + + // If there are more than 1000 files in the next bucket, we must + // advance anyway and lose out on some files :(. + if l := len(sizes); l > 0 && sizes[l-1] == cur { + cur++ + } + + nextAccessible, err := cache.CountResults(cur) + if err != nil { + return nil, fmt.Errorf( + "cache should be populated at %d already, got %v", + cur, err) + } + if nextAccessible < filesAccessible { + return nil, fmt.Errorf( + "Number of results dropped from %d to %d within range search", + filesAccessible, nextAccessible) + } + + filesAccessible = nextAccessible + if nextAccessible < totalFiles { + sizes = append(sizes, cur) + } + } + + return formatFilesizeRanges(cache, sizes), nil +} + +// lowerBoundFileCount finds the filesize range from [0, return value] that has +// the largest file count that is smaller than or equal to +// githubMaxResultsPerQuery. It is important to note that this returned value +// could already be in a previous range if the next file size has more than 1000 +// results. It is left to the caller to handle this bit of logic and guarantee +// forward progession in this case. +func lowerBoundFileCount( + cache cachedSearch, targetFileCount uint64) (uint64, error) { + + // Binary search for file sizes that make up the next <=1000 element + // chunk. + cur := uint64(0) + increase := githubMaxFileSize / 2 + + for increase > 0 { + mid := cur + increase + + count, err := cache.CountResults(mid) + if err != nil { + return count, err + } + + if count <= targetFileCount { + cur = mid + } + + if count == targetFileCount { + break + } + + increase /= 2 + } + + return cur, nil +} + +func formatFilesizeRanges(cache cachedSearch, sizes []uint64) []string { + ranges := make([]string, 0, len(sizes)+1) + + if len(sizes) > 0 { + ranges = append(ranges, cache.RequestString( + RangeLessThan{sizes[0] + 1}, + )) + } + + for i := 0; i < len(sizes)-1; i += 1 { + ranges = append(ranges, cache.RequestString( + RangeWithin{sizes[i] + 1, sizes[i+1]}, + )) + + if i != len(sizes)-2 { + continue + } + ranges = append(ranges, cache.RequestString( + RangeGreaterThan{sizes[i+1]}, + )) + } + + return ranges +} diff --git a/internal/tools/crawler/github/split_search_ranges_test.go b/internal/tools/crawler/github/split_search_ranges_test.go new file mode 100644 index 000000000..c175486e6 --- /dev/null +++ b/internal/tools/crawler/github/split_search_ranges_test.go @@ -0,0 +1,90 @@ +package github + +import ( + "fmt" + "reflect" + "testing" +) + +type testCachedSearch struct { + cache map[uint64]uint64 +} + +func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) { + fmt.Printf("CountResults(%05x)\n", upperBound) + count, ok := c.cache[upperBound] + if !ok { + return count, fmt.Errorf("cache not set at %x", upperBound) + } + return count, nil +} + +func (c testCachedSearch) RequestString(filesize rangeFormatter) string { + return filesize.RangeString() +} + +// TODO(damienr74) make tests easier to write.. I'm thinking I can make the test +// cache take in a list of (filesize, count) pairs and it can populate the cache +// without relying on how the implementation will create queries. This was only +// a quick and dirty test to make sure that modifications are not going to break +// the functionality. +func TestRangeSplitting(t *testing.T) { + // Keys follow the binary search depending on whether or not the range + // is too small/large to find close to optimal filesize ranges. This + // test is heavily tied to the fact that the search is using powers of two + // to make progress in the search (hence the use of hexadecimal values). + cache := testCachedSearch{ + map[uint64]uint64{ + 0x80000: 5000, + 0x40000: 5000, + 0x20000: 5000, + 0x10000: 5000, + 0x08000: 5000, + 0x04000: 5000, + 0x02000: 5000, + 0x01000: 5000, + 0x00fff: 3950, + 0x00ffe: 3950, + 0x00ffc: 3950, + 0x00ff8: 3950, + 0x00ff0: 3950, + 0x00fe0: 3950, + 0x00fc0: 3950, + 0x00f80: 3950, + 0x00f00: 3950, + 0x00e00: 3950, + 0x00c00: 3950, + 0x00800: 3950, + 0x00400: 3950, + 0x00200: 3688, + 0x00180: 3028, + 0x00100: 2999, + 0x000c0: 2448, + 0x00080: 1999, + 0x00070: 1600, + 0x0006c: 1003, + 0x0006b: 1001, + 0x0006a: 999, + 0x00068: 999, + 0x00060: 999, + 0x00040: 999, + 0x00000: 0, + }, + } + + requests, err := FindRangesForRepoSearch(cache) + if err != nil { + t.Errorf("Error while finding ranges: %v", err) + } + expected := []string{ + "<107", // cache.RequestString(RangeLessThan{0x6b}), + "107..128", // cache.RequestString(RangeWithin{0x6b, 0x80}), + "129..256", // cache.RequestString(RangeWithin{0x81, 0x100}), + "257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}), + ">4095", // cache.RequestString(RangeGreaterThan{0xfff}), + } + + if !reflect.DeepEqual(requests, expected) { + t.Errorf("Expected requests (%v) to equal (%v)", requests, expected) + } +} diff --git a/internal/tools/doc/doc.go b/internal/tools/doc/doc.go new file mode 100644 index 000000000..38aa6505b --- /dev/null +++ b/internal/tools/doc/doc.go @@ -0,0 +1,115 @@ +package doc + +import ( + "fmt" + "strings" + "time" + + "sigs.k8s.io/yaml" +) + +// This document is meant to be used at the elasticsearch document type. +// Fields are serialized as-is to elasticsearch, where indices are built +// to facilitate text search queries. Identifiers, Values, FilePath, +// RepositoryURL and DocumentData are meant to be searched for text queries +// directly, while the other fields can either be used as a filter, or as +// additional metadata displayed in the UI. +// +// The fields of the document and their purpose are listed below: +// - DocumentData contains the contents of the kustomization file. +// - Kinds Represents the kubernetes Kinds that are in this file. +// - Identifiers are a list of (partial and full) identifier paths that can be +// found by users. Each part of a path is delimited by ":" e.g. spec:replicas. +// - Values are a list of identifier paths and their values that can be found by +// search queries. The path is delimited by ":" and the value follows the "=" +// symbol e.g. spec:replicas=4. +// - FilePath is the path of the file. +// - RepositoryURL is the URL of the source repository. +// - CreationTime is the time at which the file was created. +// +// Representing each Identifier and Value as a flat string representation +// facilitates the use of complex text search features from elasticsearch such +// as fuzzy searching, regex, wildcards, etc. +type KustomizationDocument struct { + DocumentData string `json:"document,omitempty"` + Kinds []string `json:"kinds,omitempty"` + Identifiers []string `json:"identifiers,omitempty"` + Values []string `json:"values,omitempty"` + FilePath string `json:"filePath,omitempty"` + RepositoryURL string `json:"repositoryUrl,omitempty"` + CreationTime time.Time `json:"creationTime,omitempty"` +} + +func (doc *KustomizationDocument) ParseYAML() error { + doc.Identifiers = make([]string, 0) + doc.Values = make([]string, 0) + + var kustomization map[string]interface{} + err := yaml.Unmarshal([]byte(doc.DocumentData), &kustomization) + if err != nil { + return fmt.Errorf("unable to parse kustomization file: %s", err) + } + + type Map struct { + data map[string]interface{} + prefix string + } + + toVisit := []Map{ + { + data: kustomization, + prefix: "", + }, + } + + identifierSet := make(map[string]struct{}) + valueSet := make(map[string]struct{}) + for i := 0; i < len(toVisit); i++ { + visiting := toVisit[i] + for k, v := range visiting.data { + identifier := fmt.Sprintf("%s:%s", visiting.prefix, + strings.Replace(k, ":", "%3A", -1)) + // noop after the first iteration. + identifier = strings.TrimLeft(identifier, ":") + + // Recursive function traverses structure to find + // identifiers and values. These later get formatted + // into doc.Identifiers and doc.Values respectively. + var traverseStructure func(interface{}) + traverseStructure = func(arg interface{}) { + switch value := arg.(type) { + case map[string]interface{}: + toVisit = append(toVisit, Map{ + data: value, + prefix: identifier, + }) + case []interface{}: + for _, val := range value { + traverseStructure(val) + } + case interface{}: + esc := strings.Replace(fmt.Sprintf("%v", + value), ":", "%3A", -1) + + valuePath := fmt.Sprintf("%s=%v", + identifier, esc) + valueSet[valuePath] = struct{}{} + } + } + traverseStructure(v) + + identifierSet[identifier] = struct{}{} + + } + } + + for val := range valueSet { + doc.Values = append(doc.Values, val) + } + + for key := range identifierSet { + doc.Identifiers = append(doc.Identifiers, key) + } + + return nil +} diff --git a/internal/tools/doc/doc_test.go b/internal/tools/doc/doc_test.go new file mode 100644 index 000000000..919d2bd94 --- /dev/null +++ b/internal/tools/doc/doc_test.go @@ -0,0 +1,110 @@ +package doc + +import ( + "reflect" + "sort" + "strings" + "testing" +) + +func TestParseYAML(t *testing.T) { + testCases := []struct { + identifiers []string + values []string + yaml string + }{ + { + identifiers: []string{ + "namePrefix", + "metadata", + "metadata:name", + "kind", + }, + values: []string{ + "namePrefix=dev-", + "metadata:name=app", + "kind=Deployment", + }, + yaml: ` +namePrefix: dev- +metadata: + name: app +kind: Deployment +`, + }, + { + identifiers: []string{ + "namePrefix", + "metadata", + "metadata:name", + "metadata:spec", + "metadata:spec:replicas", + "kind", + "replicas", + "replicas:name", + "replicas:count", + "resource", + }, + values: []string{ + "namePrefix=dev-", + "metadata:name=n1", + "metadata:spec:replicas=3", + "kind=Deployment", + "replicas:name=n1", + "replicas:name=n2", + "replicas:count=3", + "resource=file1.yaml", + "resource=file2.yaml", + }, + yaml: ` +namePrefix: dev- +# map of map +metadata: + name: n1 + spec: + replicas: 3 +kind: Deployment + +#list of map +replicas: +- name: n1 + count: 3 +- name: n2 + count: 3 + +# list +resource: +- file1.yaml +- file2.yaml +`, + }, + } + + for _, test := range testCases { + doc := KustomizationDocument{ + DocumentData: test.yaml, + FilePath: "example/path/kustomization.yaml", + } + + err := doc.ParseYAML() + if err != nil { + t.Errorf("Document error error: %s", err) + } + + cmpStrings := func(got, expected []string, label string) { + sort.Strings(got) + sort.Strings(expected) + + if !reflect.DeepEqual(got, expected) { + t.Errorf("Expected %s (%v) to be equal to (%v)\n", + label, + strings.Join(got, ","), + strings.Join(expected, ",")) + } + + } + + cmpStrings(doc.Identifiers, test.identifiers, "identifiers") + cmpStrings(doc.Values, test.values, "values") + } +} diff --git a/internal/tools/go.mod b/internal/tools/go.mod new file mode 100644 index 000000000..68ae4f06b --- /dev/null +++ b/internal/tools/go.mod @@ -0,0 +1,10 @@ +module sigs.k8s.io/kustomize/internal/tools + +go 1.12 + +require ( + github.com/gomodule/redigo v2.0.0+incompatible + github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 + gopkg.in/yaml.v2 v2.2.2 // indirect + sigs.k8s.io/yaml v1.1.0 +) diff --git a/internal/tools/go.sum b/internal/tools/go.sum new file mode 100644 index 000000000..1a7e918fd --- /dev/null +++ b/internal/tools/go.sum @@ -0,0 +1,10 @@ +github.com/gomodule/redigo v2.0.0+incompatible h1:K/R+8tc58AaqLkqG2Ol3Qk+DR/TlNuhuh457pBFPtt0= +github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +sigs.k8s.io/yaml v1.1.0 h1:4A07+ZFc2wgJwo8YNlQpr1rVlgUDlxXHhPJciaPY5gs= +sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= diff --git a/internal/tools/httpclient/httpclient.go b/internal/tools/httpclient/httpclient.go new file mode 100644 index 000000000..417c587fb --- /dev/null +++ b/internal/tools/httpclient/httpclient.go @@ -0,0 +1,19 @@ +package httpclient + +import ( + "net/http" + "time" + + "github.com/gomodule/redigo/redis" + "github.com/gregjones/httpcache" + redis_cache "github.com/gregjones/httpcache/redis" +) + +func NewClient(conn redis.Conn) *http.Client { + etagCache := redis_cache.NewWithClient(conn) + tr := httpcache.NewTransport(etagCache) + return &http.Client{ + Transport: tr, + Timeout: 10 * time.Second, + } +}