Merge pull request #1450 from damienr74/master

Add internal tooling for kustomize
This commit is contained in:
Jeff Regan
2019-08-16 14:49:37 -07:00
committed by GitHub
12 changed files with 1644 additions and 0 deletions

View File

@@ -0,0 +1,76 @@
// Package crawler provides helper methods and defines an interface for lauching
// source repository crawlers that retrieve files from a source and forwards
// to a channel for indexing and retrieval.
package crawler
import (
"context"
"fmt"
"sync"
"sigs.k8s.io/kustomize/internal/tools/doc"
)
// Crawler forwards documents from source repositories to index and store them
// for searching. Each crawler is responsible for querying it's source of
// information, and forwarding files that have not been seen before or that need
// updating.
type Crawler interface {
// Crawl returns when it is done processing. This method does not take
// ownership of the channel. The channel is write only, and it
// designates where the crawler should forward the documents.
Crawl(ctx context.Context, output chan<- *doc.KustomizationDocument) error
}
// CrawlerRunner is a blocking function and only returns once all of the
// crawlers are finished with execution.
//
// This function uses the output channel to forward kustomization documents
// from a list of crawlers. The output is to be consumed by a database/search
// indexer for later retrieval.
//
// The return value is an array of errors in which each index represents the
// index of the crawler that emitted the error. Although the errors themselves
// can be nil, the array will always be exactly the size of the crawlers array.
func CrawlerRunner(ctx context.Context,
output chan<- *doc.KustomizationDocument, crawlers []Crawler) []error {
errs := make([]error, len(crawlers))
wg := sync.WaitGroup{}
for i, crawler := range crawlers {
// Crawler implementations get their own channels to prevent a
// crawler from closing the main output channel.
docs := make(chan *doc.KustomizationDocument)
wg.Add(2)
// Forward all of the documents from this crawler's channel to
// the main output channel.
go func(docs <-chan *doc.KustomizationDocument) {
defer wg.Done()
for doc := range docs {
output <- doc
}
}(docs)
// Run this crawler and capture its returned error.
go func(idx int, crawler Crawler,
docs chan<- *doc.KustomizationDocument) {
defer func() {
wg.Done()
if r := recover(); r != nil {
errs[idx] = fmt.Errorf(
"%+v panicked: %v, additional error %v",
crawler, r, errs[idx],
)
}
}()
defer close(docs)
errs[idx] = crawler.Crawl(ctx, docs)
}(i, crawler, docs) // Copies the index and the crawler
}
wg.Wait()
return errs
}

View File

@@ -0,0 +1,124 @@
package crawler
import (
"context"
"errors"
"reflect"
"sort"
"sync"
"testing"
"sigs.k8s.io/kustomize/internal/tools/doc"
)
// Simple crawler that forwards it's list of documents to a provided channel and
// returns it's error to the caller.
type testCrawler struct {
docs []doc.KustomizationDocument
err error
}
// Crawl implements the Crawler interface for testing.
func (c testCrawler) Crawl(ctx context.Context,
output chan<- *doc.KustomizationDocument) error {
for i := range c.docs {
output <- &c.docs[i]
}
return c.err
}
// Used to make sure that we're comparing documents in order. This is needed
// since these documents will be sent concurrently.
type sortableDocs []doc.KustomizationDocument
func (s sortableDocs) Less(i, j int) bool {
return s[i].FilePath < s[j].FilePath
}
func (s sortableDocs) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
func (s sortableDocs) Len() int {
return len(s)
}
func TestCrawlerRunner(t *testing.T) {
tests := []struct {
tc []Crawler
errs []error
docs sortableDocs
}{
{
tc: []Crawler{
testCrawler{
docs: []doc.KustomizationDocument{
{FilePath: "crawler1/doc1"},
{FilePath: "crawler1/doc2"},
{FilePath: "crawler1/doc3"},
},
},
testCrawler{err: errors.New("crawler2")},
testCrawler{},
testCrawler{
docs: []doc.KustomizationDocument{
{FilePath: "crawler4/doc1"},
{FilePath: "crawler4/doc2"},
},
err: errors.New("crawler4"),
},
},
errs: []error{
nil,
errors.New("crawler2"),
nil,
errors.New("crawler4"),
},
docs: sortableDocs{
{FilePath: "crawler1/doc1"},
{FilePath: "crawler1/doc2"},
{FilePath: "crawler1/doc3"},
{FilePath: "crawler4/doc1"},
{FilePath: "crawler4/doc2"},
},
},
}
for _, test := range tests {
output := make(chan *doc.KustomizationDocument)
wg := sync.WaitGroup{}
wg.Add(1)
// Run the Crawler runner with a list of crawlers.
go func() {
defer close(output)
defer wg.Done()
errs := CrawlerRunner(context.Background(), output,
test.tc)
// Check that errors are returned as they should be.
if !reflect.DeepEqual(errs, test.errs) {
t.Errorf("Expected errs (%v) to equal (%v)",
errs, test.errs)
}
}()
// Iterate over the output channel of Crawler runner.
returned := make(sortableDocs, 0, len(test.docs))
for doc := range output {
returned = append(returned, *doc)
}
// Check that all documents are received.
sort.Sort(returned)
if !reflect.DeepEqual(returned, test.docs) {
t.Errorf("Expected docs (%v) to equal (%v)\n",
returned, test.docs)
}
wg.Wait()
}
}

View File

@@ -0,0 +1,478 @@
// Package github implements the crawler.Crawler interface, getting data
// from the Github search API.
package github
import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"math"
"net/http"
"os"
"regexp"
"strconv"
"strings"
"time"
"sigs.k8s.io/kustomize/internal/tools/doc"
)
var logger = log.New(os.Stdout, "Github Crawler: ",
log.LstdFlags|log.LUTC|log.Llongfile)
// Implements crawler.Crawler.
type githubCrawler struct {
client GitHubClient
query Query
}
type GitHubClient struct {
RequestConfig
retryCount uint64
client *http.Client
}
func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
query Query) githubCrawler {
return githubCrawler{
client: GitHubClient{
retryCount: retryCount,
client: client,
RequestConfig: RequestConfig{
perPage: githubMaxPageSize,
accessToken: accessToken,
},
},
query: query,
}
}
// Implements crawler.Crawler.
func (gc githubCrawler) Crawl(
ctx context.Context, output chan<- *doc.KustomizationDocument) error {
noETagClient := GitHubClient{
RequestConfig: gc.client.RequestConfig,
client: &http.Client{Timeout: gc.client.client.Timeout},
retryCount: gc.client.retryCount,
}
// Since Github returns a max of 1000 results per query, we can use
// multiple queries that split the search space into chunks of at most
// 1000 files to get all of the data.
ranges, err := FindRangesForRepoSearch(newCache(noETagClient, gc.query))
if err != nil {
return fmt.Errorf("could not split %v into ranges, %v\n",
gc.query, err)
}
logger.Println("ranges: ", ranges)
// Query each range for files.
errs := make(multiError, 0)
for _, query := range ranges {
err := processQuery(ctx, gc.client, query, output)
if err != nil {
errs = append(errs, err)
}
}
return errs
}
// processQuery follows all of the pages in a query, and updates/adds the
// documents from the crawl to the datastore/index.
func processQuery(ctx context.Context, gcl GitHubClient, query string,
output chan<- *doc.KustomizationDocument) error {
queryPages := make(chan GithubResponseInfo)
go func() {
// Forward the document metadata to the retrieval channel.
// This separation allows for concurrent requests for the code
// search, and the retrieval portions of the API.
err := gcl.ForwardPaginatedQuery(ctx, query, queryPages)
if err != nil {
// TODO(damienr74) handle this error with redis?
logger.Println(err)
}
close(queryPages)
}()
errs := make(multiError, 0)
errorCnt := 0
totalCnt := 0
for page := range queryPages {
if page.Error != nil {
errs = append(errs, page.Error)
continue
}
for _, file := range page.Parsed.Items {
// TODO(damienr74) This is where we'd need to
// communicate with redis. Currently always doing a full
// reindex of the documents. Since the documents are in
// sorted order in each bucket, we can short circuit the
// search when we find a file that has been seen, or we
// can choose to selectively update files.
k, err := kustomizationResultAdapter(gcl, file)
if err != nil {
errs = append(errs, err)
errorCnt++
continue
}
output <- k
totalCnt++
}
logger.Printf("got %d files out of %d from API. %d of %d had errors\n",
totalCnt, page.Parsed.TotalCount, errorCnt, totalCnt)
}
return errs
}
func kustomizationResultAdapter(gcl GitHubClient, k GithubFileSpec) (
*doc.KustomizationDocument, error) {
data, err := gcl.GetFileData(k)
if err != nil {
return nil, err
}
creationTime, err := gcl.GetFileCreationTime(k)
if err != nil {
logger.Printf("(error: %v) initializing to current time.", err)
}
doc := doc.KustomizationDocument{
DocumentData: string(data),
FilePath: k.Path,
RepositoryURL: k.Repository.URL,
CreationTime: creationTime,
}
return &doc, nil
}
// ForwardPaginatedQuery follows the links to the next pages and performs all of
// the queries for a given search query, relaying the data from each request
// back to an output channel.
func (gcl GitHubClient) ForwardPaginatedQuery(ctx context.Context, query string,
output chan<- GithubResponseInfo) error {
logger.Println("querying: ", query)
response := gcl.parseGithubResponse(query)
if response.Error != nil {
return response.Error
}
output <- response
for response.LastURL != "" && response.NextURL != "" {
select {
case <-ctx.Done():
return nil
default:
response = gcl.parseGithubResponse(response.NextURL)
if response.Error != nil {
return response.Error
}
output <- response
}
}
return nil
}
// GetFileData gets the bytes from a file.
func (gcl GitHubClient) GetFileData(k GithubFileSpec) ([]byte, error) {
url := gcl.ContentsRequest(k.Repository.FullName, k.Path)
resp, err := gcl.GetReposData(url)
if err != nil {
return nil, fmt.Errorf("%+v: could not get '%s' metadata: %v",
k, url, err)
}
data, err := ioutil.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("%+v: could not read '%s' metadata: %v",
k, url, err)
}
resp.Body.Close()
type githubContentRawURL struct {
DownloadURL string `json:"download_url,omitempty"`
}
var rawURL githubContentRawURL
err = json.Unmarshal(data, &rawURL)
if err != nil {
return nil, fmt.Errorf(
"%+v: could not get 'download_url' from '%s' response: %v",
k, data, err)
}
resp, err = gcl.GetRawUserContent(rawURL.DownloadURL)
if err != nil {
return nil, fmt.Errorf("%+v: could not fetch file raw data '%s': %v",
k, rawURL.DownloadURL, err)
}
defer resp.Body.Close()
return ioutil.ReadAll(resp.Body)
}
// GetFileCreationTime gets the earliest date of a file.
func (gcl GitHubClient) GetFileCreationTime(
k GithubFileSpec) (time.Time, error) {
url := gcl.CommitsRequest(k.Repository.FullName, k.Path)
defaultTime := time.Now()
resp, err := gcl.GetReposData(url)
if err != nil {
return defaultTime, fmt.Errorf(
"%+v: '%s' could not get metadata: %v", k, url, err)
}
type DateSpec struct {
Commit struct {
Author struct {
Date string `json:"date,omitempty"`
} `json:"author,omitempty"`
} `json:"commit,omitempty"`
}
_, lastURL := parseGithubLinkFormat(resp.Header.Get("link"))
if lastURL != "" {
resp, err = gcl.GetReposData(lastURL)
if err != nil {
return defaultTime, fmt.Errorf(
"%+v: '%s' could not get metadata: %v",
k, lastURL, err)
}
}
defer resp.Body.Close()
data, err := ioutil.ReadAll(resp.Body)
if err != nil {
return defaultTime, fmt.Errorf(
"%+v: failed to read metadata: %v", k, err)
}
earliestDate := []DateSpec{}
err = json.Unmarshal(data, &earliestDate)
size := len(earliestDate)
if err != nil || size == 0 {
return defaultTime, fmt.Errorf(
"%+v: server response '%s' not in expected format: %v",
k, data, err)
}
return time.Parse(time.RFC3339, earliestDate[size-1].Commit.Author.Date)
}
// TODO(damienr74) change the tickers to actually check api rate limits, reset
// times, and throttle requests dynamically based off of current utilization,
// instead of hardcoding the documented values, these calls are not quota'd.
// This is now especially important, since caching the API requests will reduce
// API quota use (so we can actually make more requests in the allotted time
// period).
//
// See https://developer.github.com/v3/rate_limit/ for details.
var (
searchRateTicker = time.NewTicker(time.Second * 2)
contentRateTicker = time.NewTicker(time.Second * 1)
)
func throttleSearchAPI() {
<-searchRateTicker.C
}
func throttleRepoAPI() {
<-contentRateTicker.C
}
const (
accessTokenKeyword = "access_token="
perPageKeyword = "per_page="
contentSearchURL = "https://api.github.com/repos"
contentKeyword = "contents"
)
type multiError []error
func (me multiError) Error() string {
size := len(me) + 2
strs := make([]string, size)
strs[0] = "Errors [\n\t"
for i, err := range me {
strs[i+1] = err.Error()
}
strs[size-1] = "\n]"
return strings.Join(strs, "\n\t")
}
type GithubFileSpec struct {
Path string `json:"path,omitempty"`
Repository struct {
URL string `json:"html_url,omitempty"`
FullName string `json:"full_name,omitempty"`
} `json:"repository,omitempty"`
}
type githubResponse struct {
// MaxUint is reserved as a sentinel value.
// This is the number of files that match the query.
TotalCount uint64 `json:"total_count,omitempty"`
// Github representation of a file.
Items []GithubFileSpec `json:"items,omitempty"`
}
type GithubResponseInfo struct {
*http.Response
Parsed *githubResponse
Error error
NextURL string
LastURL string
}
func parseGithubLinkFormat(links string) (string, string) {
const (
linkNext = "next"
linkLast = "last"
linkInfoURL = 1
linkInfoRel = 2
)
next, last := "", ""
linkInfo := regexp.MustCompile(`<(.*)>.*; rel="(last|next)"`)
for _, link := range strings.Split(links, ",") {
linkParse := linkInfo.FindStringSubmatch(link)
if len(linkParse) != 3 {
continue
}
url := linkParse[linkInfoURL]
switch linkParse[linkInfoRel] {
case linkNext:
next = url
case linkLast:
last = url
default:
}
}
return next, last
}
func (gcl GitHubClient) parseGithubResponse(getRequest string) GithubResponseInfo {
resp, err := gcl.SearchGithubAPI(getRequest)
requestInfo := GithubResponseInfo{
Response: resp,
Error: err,
Parsed: nil,
}
if err != nil || resp == nil {
return requestInfo
}
var data []byte
defer resp.Body.Close()
data, requestInfo.Error = ioutil.ReadAll(resp.Body)
if requestInfo.Error != nil {
return requestInfo
}
if resp.StatusCode != http.StatusOK {
logger.Println("query: ", getRequest)
logger.Println("status not OK at the source")
logger.Println("header dump", resp.Header)
logger.Println("body dump", string(data))
requestInfo.Error = fmt.Errorf("request rejected, status '%s'",
resp.Status)
return requestInfo
}
requestInfo.NextURL, requestInfo.LastURL =
parseGithubLinkFormat(resp.Header.Get("link"))
resultCount := githubResponse{
TotalCount: math.MaxUint64,
}
requestInfo.Error = json.Unmarshal(data, &resultCount)
if requestInfo.Error != nil {
return requestInfo
}
requestInfo.Parsed = &resultCount
return requestInfo
}
// SearchGithubAPI performs a search query and handles rate limitting for
// the 'code/search?' endpoint as well as timed retries in the case of abuse
// prevention.
func (gcl GitHubClient) SearchGithubAPI(query string) (*http.Response, error) {
throttleSearchAPI()
return gcl.getWithRetry(query)
}
// GetReposData performs a search query and handles rate limitting for
// the '/repos' endpoint as well as timed retries in the case of abuse
// prevention.
func (gcl GitHubClient) GetReposData(query string) (*http.Response, error) {
throttleRepoAPI()
return gcl.getWithRetry(query)
}
// User content (file contents) is not API rate limited, so there's no use in
// throttling this call.
func (gcl GitHubClient) GetRawUserContent(query string) (*http.Response, error) {
return gcl.getWithRetry(query)
}
func (gcl GitHubClient) getWithRetry(
query string) (resp *http.Response, err error) {
resp, err = gcl.client.Get(query)
retryCount := gcl.retryCount
for err == nil &&
resp.StatusCode == http.StatusForbidden &&
retryCount > 0 {
retryTime := resp.Header.Get("Retry-After")
i, err := strconv.Atoi(retryTime)
if err != nil {
return resp, fmt.Errorf(
"query '%s' forbidden without 'Retry-After'", query)
}
logger.Printf(
"status forbidden, retring %d more times\n", retryCount)
logger.Printf("waiting %d seconds before retrying\n", i)
time.Sleep(time.Second * time.Duration(i))
retryCount--
resp, err = gcl.client.Get(query)
}
if err != nil {
return resp, fmt.Errorf("query '%s' could not be processed, %v",
query, err)
}
return resp, err
}

View File

@@ -0,0 +1,219 @@
package github
import (
"fmt"
"net/url"
"strings"
)
const (
perPageArg = "per_page"
accessTokenArg = "access_token"
githubMaxPageSize = 100
)
// Implementation detail, not important to external API.
type queryField struct {
name string
value interface{}
}
// Formats a query field.
func (qf queryField) String() string {
var value string
switch v := qf.value.(type) {
case string:
value = v
case rangeFormatter:
value = v.RangeString()
default:
value = fmt.Sprint(v)
}
if qf.name == "" {
return value
}
return fmt.Sprint(qf.name, ":", value)
}
// Example of formating a query:
// QueryWith(
// Filename("kustomization.yaml"),
// Filesize(RangeWithin{64, 192}),
// Keyword("copyright"),
// Keyword("2019"),
// ).String()
//
// Outputs "q=filename:kustomization.yaml+size:64..192+copyright+2018" which
// would search for files that have [64, 192] bytes (inclusive range) and that
// contain the keywords 'copyright' and '2019' somewhere in the file.
type Query []queryField
func QueryWith(qfs ...queryField) Query {
return Query(qfs)
}
func (q Query) String() string {
strs := make([]string, 0, len(q))
for _, elem := range q {
str := elem.String()
if str == "" {
continue
}
strs = append(strs, str)
}
query := strings.Join(strs, "+")
if query == "" {
return query
}
return "q=" + query
}
// Keyword takes a single word, and formats it according to the Github API.
func Keyword(k string) queryField {
return queryField{value: k}
}
// Filesize takes a rangeFormatter and formats it according to the Github API.
func Filesize(r rangeFormatter) queryField {
return queryField{name: "size", value: r}
}
// Filename takes a filename and formats it according to the Github API.
func Filename(f string) queryField {
return queryField{name: "filename", value: f}
}
// Path takes a filepath and formats it according to the Github API.
func Path(p string) queryField {
return queryField{name: "path", value: p}
}
// RequestConfig stores common variables that must be present for the queries.
// - CodeSearchRequests: ask Github to check the code indices given a query.
// - ContentsRequests: ask Github where to download a resource given a repo and a
// file path.
// - CommitsRequests: asks Github to list commits made one a file. Useful to
// determine the date of a file.
type RequestConfig struct {
perPage uint64
accessToken string
}
func NewRequestConfig(perPage uint64, accessToken string) RequestConfig {
return RequestConfig{
perPage: perPage,
accessToken: accessToken,
}
}
// CodeSearchRequestWith given a list of query parameters that specify the
// (patial) query, returns a request object with the (parital) query. Must call
// the URL method to get the string value of the URL. See request.CopyWith, to
// understand why the request object is useful.
func (rc RequestConfig) CodeSearchRequestWith(query Query) request {
req := rc.makeRequest("search/code", query)
req.vals.Set("sort", "indexed")
req.vals.Set("order", "desc")
return req
}
// ContentsRequest given the repo name, and the filepath returns a formatted
// query for the Github API to find the dowload information of this filepath.
func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string {
uri := fmt.Sprintf("repos/%s/contents/%s", fullRepoName, path)
return rc.makeRequest(uri, Query{}).URL()
}
// CommitsRequest given the repo name, and a filepath returns a formatted query
// for the Github API to find the commits that affect this file.
func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
uri := fmt.Sprintf("repos/%s/commits", fullRepoName)
return rc.makeRequest(uri, Query{Path(path)}).URL()
}
func (rc RequestConfig) makeRequest(path string, query Query) request {
vals := url.Values{}
if rc.accessToken != "" {
vals.Set(accessTokenArg, rc.accessToken)
}
vals.Set(perPageArg, fmt.Sprint(rc.perPage))
return request{
url: url.URL{
Scheme: "https",
Host: "api.github.com",
Path: path,
},
vals: vals,
query: query,
}
}
type request struct {
url url.URL
vals url.Values
query Query
}
// CopyWith copies the requests and adds the extra query parameters. Usefull
// for dynamically adding sizes to a filename only query without modifying it.
func (r request) CopyWith(queryParams ...queryField) request {
cpy := r
cpy.query = append(cpy.query, queryParams...)
return cpy
}
// URL encodes the variables and the URL representation into a string.
func (r request) URL() string {
// Github does not handle URL encoding properly in its API for the
// q='...', so the query parameter is added without any encoding
// manually.
encoded := r.vals.Encode()
query := r.query.String()
sep := "&"
if query == "" {
sep = ""
}
if encoded == "" && query != "" {
sep = "?"
}
r.url.RawQuery = encoded + sep + query
return r.url.String()
}
// Allows to define a range of numbers and print it in the github range
// query format https://help.github.com/en/articles/understanding-the-search-syntax.
type rangeFormatter interface {
RangeString() string
}
// RangeLessThan is a range of values strictly less than (<) size.
type RangeLessThan struct {
size uint64
}
func (r RangeLessThan) RangeString() string {
return fmt.Sprintf("<%d", r.size)
}
// RangeLessThan is a range of values strictly greater than (>) size.
type RangeGreaterThan struct {
size uint64
}
func (r RangeGreaterThan) RangeString() string {
return fmt.Sprintf(">%d", r.size)
}
// RangeWithin is an inclusive range from start to end.
type RangeWithin struct {
start uint64
end uint64
}
func (r RangeWithin) RangeString() string {
return fmt.Sprintf("%d..%d", r.start, r.end)
}

View File

@@ -0,0 +1,119 @@
package github
import (
"testing"
)
func TestQueryFields(t *testing.T) {
testCases := []struct {
formatter queryField
expected string
}{
{
formatter: Keyword("keyword"),
expected: "keyword",
},
{
formatter: Filesize(RangeLessThan{23}),
expected: "size:<23",
},
{
formatter: Filesize(RangeWithin{24, 64}),
expected: "size:24..64",
},
{
formatter: Filesize(RangeGreaterThan{64}),
expected: "size:>64",
},
{
formatter: Path("some/path/to/file"),
expected: "path:some/path/to/file",
},
{
formatter: Filename("kustomization.yaml"),
expected: "filename:kustomization.yaml",
},
}
for _, test := range testCases {
if result := test.formatter.String(); result != test.expected {
t.Errorf("got (%#v = %s), expected %s", test.formatter, result, test.expected)
}
}
}
func TestQueryType(t *testing.T) {
testCases := []struct {
query Query
expected string
}{
{
query: QueryWith(
Filesize(RangeWithin{24, 64}),
Filename("kustomization.yaml"),
Keyword("keyword1"),
Keyword("keyword2"),
),
expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2",
},
}
for _, test := range testCases {
if queryStr := test.query.String(); queryStr != test.expected {
t.Errorf("got (%#v = %s), expected %s", test.query, queryStr, test.expected)
}
}
}
func TestGithubSearchQuery(t *testing.T) {
const (
accessToken = "random_token"
perPage = 100
)
testCases := []struct {
rc RequestConfig
codeQuery Query
fullRepoName string
path string
expectedCodeQuery string
expectedContentsQuery string
expectedCommitsQuery string
}{
{
rc: RequestConfig{
perPage: perPage,
accessToken: accessToken,
},
codeQuery: Query{
Filename("kustomization.yaml"),
Filesize(RangeWithin{64, 128}),
},
fullRepoName: "kubernetes-sigs/kustomize",
path: "examples/helloWorld/kustomization.yaml",
expectedCodeQuery: "https://api.github.com/search/code?" +
"access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128",
expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" +
"examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100",
expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" +
"access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml",
},
}
for _, test := range testCases {
if result := test.rc.CodeSearchRequestWith(test.codeQuery).URL(); result != test.expectedCodeQuery {
t.Errorf("Got code query: %s, expected %s", result, test.expectedCodeQuery)
}
if result := test.rc.ContentsRequest(test.fullRepoName, test.path); result != test.expectedContentsQuery {
t.Errorf("Got contents query: %s, expected %s", result, test.expectedContentsQuery)
}
if result := test.rc.CommitsRequest(test.fullRepoName, test.path); result != test.expectedCommitsQuery {
t.Errorf("Got commits query: %s, expected %s", result, test.expectedCommitsQuery)
}
}
}

View File

@@ -0,0 +1,274 @@
package github
import (
"fmt"
"math/bits"
)
// Files cannot be more than 2^19 bytes, according to
// https://help.github.com/en/articles/searching-code#considerations-for-code-search
const (
githubMaxFileSize = uint64(1 << 19)
githubMaxResultsPerQuery = uint64(1000)
)
// Interface for testing purposes. Not expecting to have multiple
// implementations.
type cachedSearch interface {
CountResults(uint64) (uint64, error)
RequestString(filesize rangeFormatter) string
}
// Cache uses bit tricks to be more efficient in detecting
// inconsistencies in the returned data from the Github API.
// Therefore, the cache expects a search to always start at 0, and
// it expects the max file size to be a power of 2. If this is to be changed
// there are a few considerations to keep in mind:
//
// 1. The cache is only efficient if the queries can be reused, so if
// the first chunk of files lives in the range 0..x, continuing the
// search for the next chunk from x+1..max (while asymptotically sane)
// may actually be less efficient since the cache is essentially reset
// at every interval. This leads to a larger number of requests in
// practice, and requests are what's expensive (rate limits).
//
// 2. The github API is not perfectly monotonic.. (this is somewhat
// problematic). The current cache implementation looks at the
// predecessor entry to find out if the current value is monotonic.
// This is where the bit trick is used, since each step in the binary
// search is adding or ommiting to add a decreasing of 2 to the query value,
// we can remove the least significant set bit to find the predecessor in
// constant time. Ultimately since the search is rate limited, we could also
// easily afford to compute this in linear time by iterating
// over cached values.
type githubCachedSearch struct {
cache map[uint64]uint64
gcl GitHubClient
baseRequest request
}
func newCache(client GitHubClient, query Query) githubCachedSearch {
return githubCachedSearch{
cache: map[uint64]uint64{
0: 0,
},
gcl: client,
baseRequest: client.CodeSearchRequestWith(query),
}
}
func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) {
count, cached := c.cache[upperBound]
if cached {
return count, nil
}
sizeRange := RangeWithin{0, upperBound}
rangeRequest := c.RequestString(sizeRange)
result := c.gcl.parseGithubResponse(rangeRequest)
if result.Error != nil {
return count, result.Error
}
// As range search uses powers of 2 for binary search, the previously
// cached value is easy to find by removing the least significant set
// bit from the current upperBound, since each step of the search adds
// least significant set bit.
//
// Finding the predecessor could also be implemented by iterating over
// the map to find the largest key that is smaller than upperBound if
// this approach deemed too complex.
trail := bits.TrailingZeros64(upperBound)
prev := uint64(0)
if trail != 64 {
prev = upperBound - (1 << uint64(trail))
}
// Sometimes the github API is not monotonically increasing, or ouputs
// an erroneous value of 0, or 1. This logic makes sure that it was not
// erroneous, and that the sequence continues to be monotonic by setting
// the current query count to match the previous value. which at least
// guarantees that the range search terminates.
//
// On the other hand, if files are added, then we way loose out on some
// files in a reviously completed range, but these files should be there
// the next time the crawler runs, so this is not really problematic.
retryMonotonicCount := 4
for result.Parsed.TotalCount < c.cache[prev] {
logger.Printf(
"Retrying query... current lower bound: %d, got: %d\n",
c.cache[prev], result.Parsed.TotalCount)
result = c.gcl.parseGithubResponse(rangeRequest)
if result.Error != nil {
return count, result.Error
}
retryMonotonicCount--
if retryMonotonicCount <= 0 {
result.Parsed.TotalCount = c.cache[prev]
logger.Println(
"Retries for monotonic check exceeded,",
" setting value to match predecessor")
}
}
count = result.Parsed.TotalCount
logger.Printf("Caching new query %s, with count %d\n",
sizeRange.RangeString(), count)
c.cache[upperBound] = count
return count, nil
}
func (c githubCachedSearch) RequestString(filesize rangeFormatter) string {
return c.baseRequest.CopyWith(Filesize(filesize)).URL()
}
// Outputs a (possibly incomplete) list of ranges to query to find most search
// results as permissible by the search github search API. Github search only
// allows 1,000 results per query (paginated).
// Source: https://developer.github.com/v3/search/
//
// This leaves the possibility of having file sizes with more than 1000 results,
// This would mean that the search as it is could not find all files. If queries
// are sorted by last indexed, and retrieved on regular intervals, it should be
// sufficient to get most if not all documents.
func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
totalFiles, err := cache.CountResults(githubMaxFileSize)
if err != nil {
return nil, err
}
logger.Println("total files: ", totalFiles)
if githubMaxResultsPerQuery >= totalFiles {
return []string{
cache.RequestString(RangeWithin{0, githubMaxFileSize}),
}, nil
}
// Find all the ranges of file sizes such that all files are queryable
// using the Github API. This does not compute an optimal ranges, since
// the number of queries needed to get the information required to
// compute an optimal range is expected to be much larger than the
// number of queries performed this way.
//
// The number of ranges is k = (number of files)/1000, and finding a
// range is logarithmic in the max file size (n = filesize). This means
// that preprocessing takes O(k * lg n) queries to find the ranges with
// a binary search over file sizes.
//
// My intuition is that this approach is competitive to a perfectly
// optimal solution, but I didn't actually take the time to do a
// rigurous proof. Intuitively, since files sizes are typically power
// law distibuted the binary search will be very skewed towards the
// smaller file ranges. This means that in practice this approach will
// make fewer than (#files/1000)*(log(n) = 19) queries for
// preprocessing, since it reuses a lot of the queries in the denser
// ranges. Furthermore, because of the distribution, it should be very
// easy to find ranges that are very close to the upper bound, up to
// the limiting factor of having no more than 1000 files accessible per
// range.
filesAccessible := uint64(0)
sizes := make([]uint64, 0)
for filesAccessible < totalFiles {
target := filesAccessible + githubMaxResultsPerQuery
if target >= totalFiles {
break
}
logger.Printf("%d accessible files, next target = %d\n",
filesAccessible, target)
cur, err := lowerBoundFileCount(cache, target)
if err != nil {
return nil, err
}
// If there are more than 1000 files in the next bucket, we must
// advance anyway and lose out on some files :(.
if l := len(sizes); l > 0 && sizes[l-1] == cur {
cur++
}
nextAccessible, err := cache.CountResults(cur)
if err != nil {
return nil, fmt.Errorf(
"cache should be populated at %d already, got %v",
cur, err)
}
if nextAccessible < filesAccessible {
return nil, fmt.Errorf(
"Number of results dropped from %d to %d within range search",
filesAccessible, nextAccessible)
}
filesAccessible = nextAccessible
if nextAccessible < totalFiles {
sizes = append(sizes, cur)
}
}
return formatFilesizeRanges(cache, sizes), nil
}
// lowerBoundFileCount finds the filesize range from [0, return value] that has
// the largest file count that is smaller than or equal to
// githubMaxResultsPerQuery. It is important to note that this returned value
// could already be in a previous range if the next file size has more than 1000
// results. It is left to the caller to handle this bit of logic and guarantee
// forward progession in this case.
func lowerBoundFileCount(
cache cachedSearch, targetFileCount uint64) (uint64, error) {
// Binary search for file sizes that make up the next <=1000 element
// chunk.
cur := uint64(0)
increase := githubMaxFileSize / 2
for increase > 0 {
mid := cur + increase
count, err := cache.CountResults(mid)
if err != nil {
return count, err
}
if count <= targetFileCount {
cur = mid
}
if count == targetFileCount {
break
}
increase /= 2
}
return cur, nil
}
func formatFilesizeRanges(cache cachedSearch, sizes []uint64) []string {
ranges := make([]string, 0, len(sizes)+1)
if len(sizes) > 0 {
ranges = append(ranges, cache.RequestString(
RangeLessThan{sizes[0] + 1},
))
}
for i := 0; i < len(sizes)-1; i += 1 {
ranges = append(ranges, cache.RequestString(
RangeWithin{sizes[i] + 1, sizes[i+1]},
))
if i != len(sizes)-2 {
continue
}
ranges = append(ranges, cache.RequestString(
RangeGreaterThan{sizes[i+1]},
))
}
return ranges
}

View File

@@ -0,0 +1,90 @@
package github
import (
"fmt"
"reflect"
"testing"
)
type testCachedSearch struct {
cache map[uint64]uint64
}
func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) {
fmt.Printf("CountResults(%05x)\n", upperBound)
count, ok := c.cache[upperBound]
if !ok {
return count, fmt.Errorf("cache not set at %x", upperBound)
}
return count, nil
}
func (c testCachedSearch) RequestString(filesize rangeFormatter) string {
return filesize.RangeString()
}
// TODO(damienr74) make tests easier to write.. I'm thinking I can make the test
// cache take in a list of (filesize, count) pairs and it can populate the cache
// without relying on how the implementation will create queries. This was only
// a quick and dirty test to make sure that modifications are not going to break
// the functionality.
func TestRangeSplitting(t *testing.T) {
// Keys follow the binary search depending on whether or not the range
// is too small/large to find close to optimal filesize ranges. This
// test is heavily tied to the fact that the search is using powers of two
// to make progress in the search (hence the use of hexadecimal values).
cache := testCachedSearch{
map[uint64]uint64{
0x80000: 5000,
0x40000: 5000,
0x20000: 5000,
0x10000: 5000,
0x08000: 5000,
0x04000: 5000,
0x02000: 5000,
0x01000: 5000,
0x00fff: 3950,
0x00ffe: 3950,
0x00ffc: 3950,
0x00ff8: 3950,
0x00ff0: 3950,
0x00fe0: 3950,
0x00fc0: 3950,
0x00f80: 3950,
0x00f00: 3950,
0x00e00: 3950,
0x00c00: 3950,
0x00800: 3950,
0x00400: 3950,
0x00200: 3688,
0x00180: 3028,
0x00100: 2999,
0x000c0: 2448,
0x00080: 1999,
0x00070: 1600,
0x0006c: 1003,
0x0006b: 1001,
0x0006a: 999,
0x00068: 999,
0x00060: 999,
0x00040: 999,
0x00000: 0,
},
}
requests, err := FindRangesForRepoSearch(cache)
if err != nil {
t.Errorf("Error while finding ranges: %v", err)
}
expected := []string{
"<107", // cache.RequestString(RangeLessThan{0x6b}),
"107..128", // cache.RequestString(RangeWithin{0x6b, 0x80}),
"129..256", // cache.RequestString(RangeWithin{0x81, 0x100}),
"257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}),
">4095", // cache.RequestString(RangeGreaterThan{0xfff}),
}
if !reflect.DeepEqual(requests, expected) {
t.Errorf("Expected requests (%v) to equal (%v)", requests, expected)
}
}

115
internal/tools/doc/doc.go Normal file
View File

@@ -0,0 +1,115 @@
package doc
import (
"fmt"
"strings"
"time"
"sigs.k8s.io/yaml"
)
// This document is meant to be used at the elasticsearch document type.
// Fields are serialized as-is to elasticsearch, where indices are built
// to facilitate text search queries. Identifiers, Values, FilePath,
// RepositoryURL and DocumentData are meant to be searched for text queries
// directly, while the other fields can either be used as a filter, or as
// additional metadata displayed in the UI.
//
// The fields of the document and their purpose are listed below:
// - DocumentData contains the contents of the kustomization file.
// - Kinds Represents the kubernetes Kinds that are in this file.
// - Identifiers are a list of (partial and full) identifier paths that can be
// found by users. Each part of a path is delimited by ":" e.g. spec:replicas.
// - Values are a list of identifier paths and their values that can be found by
// search queries. The path is delimited by ":" and the value follows the "="
// symbol e.g. spec:replicas=4.
// - FilePath is the path of the file.
// - RepositoryURL is the URL of the source repository.
// - CreationTime is the time at which the file was created.
//
// Representing each Identifier and Value as a flat string representation
// facilitates the use of complex text search features from elasticsearch such
// as fuzzy searching, regex, wildcards, etc.
type KustomizationDocument struct {
DocumentData string `json:"document,omitempty"`
Kinds []string `json:"kinds,omitempty"`
Identifiers []string `json:"identifiers,omitempty"`
Values []string `json:"values,omitempty"`
FilePath string `json:"filePath,omitempty"`
RepositoryURL string `json:"repositoryUrl,omitempty"`
CreationTime time.Time `json:"creationTime,omitempty"`
}
func (doc *KustomizationDocument) ParseYAML() error {
doc.Identifiers = make([]string, 0)
doc.Values = make([]string, 0)
var kustomization map[string]interface{}
err := yaml.Unmarshal([]byte(doc.DocumentData), &kustomization)
if err != nil {
return fmt.Errorf("unable to parse kustomization file: %s", err)
}
type Map struct {
data map[string]interface{}
prefix string
}
toVisit := []Map{
{
data: kustomization,
prefix: "",
},
}
identifierSet := make(map[string]struct{})
valueSet := make(map[string]struct{})
for i := 0; i < len(toVisit); i++ {
visiting := toVisit[i]
for k, v := range visiting.data {
identifier := fmt.Sprintf("%s:%s", visiting.prefix,
strings.Replace(k, ":", "%3A", -1))
// noop after the first iteration.
identifier = strings.TrimLeft(identifier, ":")
// Recursive function traverses structure to find
// identifiers and values. These later get formatted
// into doc.Identifiers and doc.Values respectively.
var traverseStructure func(interface{})
traverseStructure = func(arg interface{}) {
switch value := arg.(type) {
case map[string]interface{}:
toVisit = append(toVisit, Map{
data: value,
prefix: identifier,
})
case []interface{}:
for _, val := range value {
traverseStructure(val)
}
case interface{}:
esc := strings.Replace(fmt.Sprintf("%v",
value), ":", "%3A", -1)
valuePath := fmt.Sprintf("%s=%v",
identifier, esc)
valueSet[valuePath] = struct{}{}
}
}
traverseStructure(v)
identifierSet[identifier] = struct{}{}
}
}
for val := range valueSet {
doc.Values = append(doc.Values, val)
}
for key := range identifierSet {
doc.Identifiers = append(doc.Identifiers, key)
}
return nil
}

View File

@@ -0,0 +1,110 @@
package doc
import (
"reflect"
"sort"
"strings"
"testing"
)
func TestParseYAML(t *testing.T) {
testCases := []struct {
identifiers []string
values []string
yaml string
}{
{
identifiers: []string{
"namePrefix",
"metadata",
"metadata:name",
"kind",
},
values: []string{
"namePrefix=dev-",
"metadata:name=app",
"kind=Deployment",
},
yaml: `
namePrefix: dev-
metadata:
name: app
kind: Deployment
`,
},
{
identifiers: []string{
"namePrefix",
"metadata",
"metadata:name",
"metadata:spec",
"metadata:spec:replicas",
"kind",
"replicas",
"replicas:name",
"replicas:count",
"resource",
},
values: []string{
"namePrefix=dev-",
"metadata:name=n1",
"metadata:spec:replicas=3",
"kind=Deployment",
"replicas:name=n1",
"replicas:name=n2",
"replicas:count=3",
"resource=file1.yaml",
"resource=file2.yaml",
},
yaml: `
namePrefix: dev-
# map of map
metadata:
name: n1
spec:
replicas: 3
kind: Deployment
#list of map
replicas:
- name: n1
count: 3
- name: n2
count: 3
# list
resource:
- file1.yaml
- file2.yaml
`,
},
}
for _, test := range testCases {
doc := KustomizationDocument{
DocumentData: test.yaml,
FilePath: "example/path/kustomization.yaml",
}
err := doc.ParseYAML()
if err != nil {
t.Errorf("Document error error: %s", err)
}
cmpStrings := func(got, expected []string, label string) {
sort.Strings(got)
sort.Strings(expected)
if !reflect.DeepEqual(got, expected) {
t.Errorf("Expected %s (%v) to be equal to (%v)\n",
label,
strings.Join(got, ","),
strings.Join(expected, ","))
}
}
cmpStrings(doc.Identifiers, test.identifiers, "identifiers")
cmpStrings(doc.Values, test.values, "values")
}
}

10
internal/tools/go.mod Normal file
View File

@@ -0,0 +1,10 @@
module sigs.k8s.io/kustomize/internal/tools
go 1.12
require (
github.com/gomodule/redigo v2.0.0+incompatible
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79
gopkg.in/yaml.v2 v2.2.2 // indirect
sigs.k8s.io/yaml v1.1.0
)

10
internal/tools/go.sum Normal file
View File

@@ -0,0 +1,10 @@
github.com/gomodule/redigo v2.0.0+incompatible h1:K/R+8tc58AaqLkqG2Ol3Qk+DR/TlNuhuh457pBFPtt0=
github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4=
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA=
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
sigs.k8s.io/yaml v1.1.0 h1:4A07+ZFc2wgJwo8YNlQpr1rVlgUDlxXHhPJciaPY5gs=
sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o=

View File

@@ -0,0 +1,19 @@
package httpclient
import (
"net/http"
"time"
"github.com/gomodule/redigo/redis"
"github.com/gregjones/httpcache"
redis_cache "github.com/gregjones/httpcache/redis"
)
func NewClient(conn redis.Conn) *http.Client {
etagCache := redis_cache.NewWithClient(conn)
tr := httpcache.NewTransport(etagCache)
return &http.Client{
Transport: tr,
Timeout: 10 * time.Second,
}
}