From fe45157b2606d0316f4da3e018bed85ea9ddef5f Mon Sep 17 00:00:00 2001 From: Damien Robichaud Date: Tue, 30 Jul 2019 16:01:00 -0700 Subject: [PATCH] Update crawler to cache web request form github. - Increase logging signal to noise ratio. - Allow to specify the `http.Client` for github requests. (This allows the use of caching http.Clients). - Clean up implementation. --- internal/search/crawler/github/crawler.go | 180 ++++++++++++------ internal/search/crawler/github/queries.go | 12 +- .../crawler/github/split_search_ranges.go | 12 +- internal/search/go.mod | 2 + internal/search/go.sum | 4 + internal/search/httpclient/httpclient.go | 19 ++ 6 files changed, 150 insertions(+), 79 deletions(-) create mode 100644 internal/search/httpclient/httpclient.go diff --git a/internal/search/crawler/github/crawler.go b/internal/search/crawler/github/crawler.go index 8c7c81f57..1195ed07d 100644 --- a/internal/search/crawler/github/crawler.go +++ b/internal/search/crawler/github/crawler.go @@ -24,18 +24,27 @@ var logger = log.New(os.Stdout, "Github Crawler: ", // Implements crawler.Crawler. type githubCrawler struct { - rc RequestConfig - query Query + client GitHubClient + query Query } -func NewCrawler( - accessToken string, retryCount uint64, query Query) githubCrawler { +type GitHubClient struct { + RequestConfig + retryCount uint64 + client *http.Client +} + +func NewCrawler(accessToken string, retryCount uint64, client *http.Client, + query Query) githubCrawler { return githubCrawler{ - rc: RequestConfig{ - perPage: githubMaxPageSize, - retryCount: retryCount, - accessToken: accessToken, + client: GitHubClient{ + retryCount: retryCount, + client: client, + RequestConfig: RequestConfig{ + perPage: githubMaxPageSize, + accessToken: accessToken, + }, }, query: query, } @@ -45,19 +54,27 @@ func NewCrawler( func (gc githubCrawler) Crawl( ctx context.Context, output chan<- *doc.KustomizationDocument) error { + noETagClient := GitHubClient{ + RequestConfig: gc.client.RequestConfig, + client: &http.Client{Timeout: gc.client.client.Timeout}, + retryCount: gc.client.retryCount, + } + // Since Github returns a max of 1000 results per query, we can use // multiple queries that split the search space into chunks of at most // 1000 files to get all of the data. - ranges, err := FindRangesForRepoSearch(newCache(gc.rc, gc.query)) + ranges, err := FindRangesForRepoSearch(newCache(noETagClient, gc.query)) if err != nil { - return fmt.Errorf("could not split search into ranges, %v\n", - err) + return fmt.Errorf("could not split %v into ranges, %v\n", + gc.query, err) } + logger.Println("ranges: ", ranges) + // Query each range for files. errs := make(multiError, 0) for _, query := range ranges { - err := processQuery(ctx, gc.rc, query, output) + err := processQuery(ctx, gc.client, query, output) if err != nil { errs = append(errs, err) } @@ -68,7 +85,7 @@ func (gc githubCrawler) Crawl( // processQuery follows all of the pages in a query, and updates/adds the // documents from the crawl to the datastore/index. -func processQuery(ctx context.Context, rc RequestConfig, query string, +func processQuery(ctx context.Context, gcl GitHubClient, query string, output chan<- *doc.KustomizationDocument) error { queryPages := make(chan GithubResponseInfo) @@ -77,8 +94,7 @@ func processQuery(ctx context.Context, rc RequestConfig, query string, // Forward the document metadata to the retrieval channel. // This separation allows for concurrent requests for the code // search, and the retrieval portions of the API. - err := ForwardPaginatedQuery( - ctx, query, rc.retryCount, queryPages) + err := gcl.ForwardPaginatedQuery(ctx, query, queryPages) if err != nil { // TODO(damienr74) handle this error with redis? logger.Println(err) @@ -87,6 +103,8 @@ func processQuery(ctx context.Context, rc RequestConfig, query string, }() errs := make(multiError, 0) + errorCnt := 0 + totalCnt := 0 for page := range queryPages { if page.Error != nil { errs = append(errs, page.Error) @@ -101,35 +119,40 @@ func processQuery(ctx context.Context, rc RequestConfig, query string, // search when we find a file that has been seen, or we // can choose to selectively update files. - k, err := kustomizationResultAdapter(rc, file) + k, err := kustomizationResultAdapter(gcl, file) if err != nil { errs = append(errs, err) + errorCnt++ continue } output <- k + totalCnt++ } + + logger.Printf("got %d files out of %d from API. %d of %d had errors\n", + totalCnt, page.Parsed.TotalCount, errorCnt, totalCnt) } return errs } -func kustomizationResultAdapter(rc RequestConfig, k GithubFileSpec) ( +func kustomizationResultAdapter(gcl GitHubClient, k GithubFileSpec) ( *doc.KustomizationDocument, error) { - data, err := GetFileData(rc, k) + data, err := gcl.GetFileData(k) if err != nil { return nil, err } - creationTime, err := GetFileCreationTime(rc, k) + creationTime, err := gcl.GetFileCreationTime(k) if err != nil { - logger.Printf("(Error: %v) initializing to current time.", err) + logger.Printf("(error: %v) initializing to current time.", err) } doc := doc.KustomizationDocument{ DocumentData: string(data), - FilePath: doc.Atom(k.Path), - RepositoryURL: doc.Atom(k.Repository.URL), + FilePath: k.Path, + RepositoryURL: k.Repository.URL, CreationTime: creationTime, } @@ -139,10 +162,12 @@ func kustomizationResultAdapter(rc RequestConfig, k GithubFileSpec) ( // ForwardPaginatedQuery follows the links to the next pages and performs all of // the queries for a given search query, relaying the data from each request // back to an output channel. -func ForwardPaginatedQuery(ctx context.Context, query string, retryCount uint64, +func (gcl GitHubClient) ForwardPaginatedQuery(ctx context.Context, query string, output chan<- GithubResponseInfo) error { - response := parseGithubResponse(query, retryCount) + logger.Println("querying: ", query) + response := gcl.parseGithubResponse(query) + if response.Error != nil { return response.Error } @@ -154,7 +179,7 @@ func ForwardPaginatedQuery(ctx context.Context, query string, retryCount uint64, case <-ctx.Done(): return nil default: - response = parseGithubResponse(response.NextURL, retryCount) + response = gcl.parseGithubResponse(response.NextURL) if response.Error != nil { return response.Error } @@ -167,20 +192,22 @@ func ForwardPaginatedQuery(ctx context.Context, query string, retryCount uint64, } // GetFileData gets the bytes from a file. -func GetFileData(rc RequestConfig, k GithubFileSpec) ([]byte, error) { +func (gcl GitHubClient) GetFileData(k GithubFileSpec) ([]byte, error) { - url := rc.ContentsRequest(k.Repository.FullName, k.Path) + url := gcl.ContentsRequest(k.Repository.FullName, k.Path) - logger.Println("content-url ", url) - resp, err := GetReposData(url, rc.RetryCount()) + resp, err := gcl.GetReposData(url) if err != nil { - return nil, err + return nil, fmt.Errorf("%+v: could not get '%s' metadata: %v", + k, url, err) } data, err := ioutil.ReadAll(resp.Body) if err != nil { - return nil, err + return nil, fmt.Errorf("%+v: could not read '%s' metadata: %v", + k, url, err) } + resp.Body.Close() type githubContentRawURL struct { DownloadURL string `json:"download_url,omitempty"` @@ -188,30 +215,33 @@ func GetFileData(rc RequestConfig, k GithubFileSpec) ([]byte, error) { var rawURL githubContentRawURL err = json.Unmarshal(data, &rawURL) if err != nil { - return nil, err + return nil, fmt.Errorf( + "%+v: could not get 'download_url' from '%s' response: %v", + k, data, err) } - logger.Println("raw-data-url", rawURL.DownloadURL) - resp, err = GetReposData(rawURL.DownloadURL, rc.RetryCount()) + resp, err = gcl.GetRawUserContent(rawURL.DownloadURL) if err != nil { - return nil, err + return nil, fmt.Errorf("%+v: could not fetch file raw data '%s': %v", + k, rawURL.DownloadURL, err) } + defer resp.Body.Close() return ioutil.ReadAll(resp.Body) } // GetFileCreationTime gets the earliest date of a file. -func GetFileCreationTime( - rc RequestConfig, k GithubFileSpec) (time.Time, error) { +func (gcl GitHubClient) GetFileCreationTime( + k GithubFileSpec) (time.Time, error) { - url := rc.CommitsRequest(k.Repository.FullName, k.Path) + url := gcl.CommitsRequest(k.Repository.FullName, k.Path) defaultTime := time.Now() - logger.Println("commits-url", url) - resp, err := GetReposData(url, rc.RetryCount()) + resp, err := gcl.GetReposData(url) if err != nil { - return defaultTime, err + return defaultTime, fmt.Errorf( + "%+v: '%s' could not get metadata: %v", k, url, err) } type DateSpec struct { @@ -224,18 +254,27 @@ func GetFileCreationTime( _, lastURL := parseGithubLinkFormat(resp.Header.Get("link")) if lastURL != "" { - resp, err = GetReposData(lastURL, rc.RetryCount()) + resp, err = gcl.GetReposData(lastURL) if err != nil { - return defaultTime, err + return defaultTime, fmt.Errorf( + "%+v: '%s' could not get metadata: %v", + k, lastURL, err) } } + defer resp.Body.Close() data, err := ioutil.ReadAll(resp.Body) + if err != nil { + return defaultTime, fmt.Errorf( + "%+v: failed to read metadata: %v", k, err) + } earliestDate := []DateSpec{} err = json.Unmarshal(data, &earliestDate) size := len(earliestDate) if err != nil || size == 0 { - return defaultTime, err + return defaultTime, fmt.Errorf( + "%+v: server response '%s' not in expected format: %v", + k, data, err) } return time.Parse(time.RFC3339, earliestDate[size-1].Commit.Author.Date) @@ -244,6 +283,9 @@ func GetFileCreationTime( // TODO(damienr74) change the tickers to actually check api rate limits, reset // times, and throttle requests dynamically based off of current utilization, // instead of hardcoding the documented values, these calls are not quota'd. +// This is now especially important, since caching the API requests will reduce +// API quota use (so we can actually make more requests in the allotted time +// period). // // See https://developer.github.com/v3/rate_limit/ for details. var ( @@ -334,8 +376,8 @@ func parseGithubLinkFormat(links string) (string, string) { return next, last } -func parseGithubResponse(getRequest string, retryCount uint64) GithubResponseInfo { - resp, err := SearchGithubAPI(getRequest, retryCount) +func (gcl GitHubClient) parseGithubResponse(getRequest string) GithubResponseInfo { + resp, err := gcl.SearchGithubAPI(getRequest) requestInfo := GithubResponseInfo{ Response: resp, Error: err, @@ -347,17 +389,18 @@ func parseGithubResponse(getRequest string, retryCount uint64) GithubResponseInf } var data []byte + defer resp.Body.Close() data, requestInfo.Error = ioutil.ReadAll(resp.Body) if requestInfo.Error != nil { return requestInfo } if resp.StatusCode != http.StatusOK { - logger.Println("Query: ", getRequest) - logger.Println("Status not OK at the source") - logger.Println("Header Dump", resp.Header) - logger.Println("Body Dump", string(data)) - requestInfo.Error = fmt.Errorf("Request Rejected, Status '%s'", + logger.Println("query: ", getRequest) + logger.Println("status not OK at the source") + logger.Println("header dump", resp.Header) + logger.Println("body dump", string(data)) + requestInfo.Error = fmt.Errorf("request rejected, status '%s'", resp.Status) return requestInfo } @@ -382,23 +425,30 @@ func parseGithubResponse(getRequest string, retryCount uint64) GithubResponseInf // SearchGithubAPI performs a search query and handles rate limitting for // the 'code/search?' endpoint as well as timed retries in the case of abuse // prevention. -func SearchGithubAPI(query string, retryCount uint64) (*http.Response, error) { +func (gcl GitHubClient) SearchGithubAPI(query string) (*http.Response, error) { throttleSearchAPI() - return getWithRetry(query, retryCount) + return gcl.getWithRetry(query) } // GetReposData performs a search query and handles rate limitting for // the '/repos' endpoint as well as timed retries in the case of abuse // prevention. -func GetReposData(query string, retryCount uint64) (*http.Response, error) { +func (gcl GitHubClient) GetReposData(query string) (*http.Response, error) { throttleRepoAPI() - return getWithRetry(query, retryCount) + return gcl.getWithRetry(query) } -func getWithRetry( - query string, retryCount uint64) (resp *http.Response, err error) { +// User content (file contents) is not API rate limited, so there's no use in +// throttling this call. +func (gcl GitHubClient) GetRawUserContent(query string) (*http.Response, error) { + return gcl.getWithRetry(query) +} - resp, err = http.Get(query) +func (gcl GitHubClient) getWithRetry( + query string) (resp *http.Response, err error) { + + resp, err = gcl.client.Get(query) + retryCount := gcl.retryCount for err == nil && resp.StatusCode == http.StatusForbidden && @@ -407,15 +457,21 @@ func getWithRetry( retryTime := resp.Header.Get("Retry-After") i, err := strconv.Atoi(retryTime) if err != nil { - return resp, fmt.Errorf("Forbidden without 'Retry-After'") + return resp, fmt.Errorf( + "query '%s' forbidden without 'Retry-After'", query) } logger.Printf( - "Status Forbidden, retring %d more times\n", retryCount) + "status forbidden, retring %d more times\n", retryCount) - logger.Printf("Waiting %d seconds before retrying\n", i) + logger.Printf("waiting %d seconds before retrying\n", i) time.Sleep(time.Second * time.Duration(i)) retryCount-- - resp, err = http.Get(query) + resp, err = gcl.client.Get(query) + } + + if err != nil { + return resp, fmt.Errorf("query '%s' could not be processed, %v", + query, err) } return resp, err diff --git a/internal/search/crawler/github/queries.go b/internal/search/crawler/github/queries.go index 7dc307914..f8ec23eea 100644 --- a/internal/search/crawler/github/queries.go +++ b/internal/search/crawler/github/queries.go @@ -99,16 +99,12 @@ func Path(p string) queryField { // determine the date of a file. type RequestConfig struct { perPage uint64 - retryCount uint64 accessToken string } -func NewRequestConfig( - perPage, retryCount uint64, accessToken string) RequestConfig { - +func NewRequestConfig(perPage uint64, accessToken string) RequestConfig { return RequestConfig{ perPage: perPage, - retryCount: retryCount, accessToken: accessToken, } } @@ -138,12 +134,6 @@ func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string { return rc.makeRequest(uri, Query{Path(path)}).URL() } -// How many times to retry the queries before giving up (used by the crawler, -// not Github). -func (rc RequestConfig) RetryCount() uint64 { - return rc.retryCount -} - func (rc RequestConfig) makeRequest(path string, query Query) request { vals := url.Values{} if rc.accessToken != "" { diff --git a/internal/search/crawler/github/split_search_ranges.go b/internal/search/crawler/github/split_search_ranges.go index c098fa6ac..58e445997 100644 --- a/internal/search/crawler/github/split_search_ranges.go +++ b/internal/search/crawler/github/split_search_ranges.go @@ -43,17 +43,17 @@ type cachedSearch interface { // over cached values. type githubCachedSearch struct { cache map[uint64]uint64 - retryCount uint64 + gcl GitHubClient baseRequest request } -func newCache(rc RequestConfig, query Query) githubCachedSearch { +func newCache(client GitHubClient, query Query) githubCachedSearch { return githubCachedSearch{ cache: map[uint64]uint64{ 0: 0, }, - retryCount: rc.RetryCount(), - baseRequest: rc.CodeSearchRequestWith(query), + gcl: client, + baseRequest: client.CodeSearchRequestWith(query), } } @@ -66,7 +66,7 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { sizeRange := RangeWithin{0, upperBound} rangeRequest := c.RequestString(sizeRange) - result := parseGithubResponse(rangeRequest, c.retryCount) + result := c.gcl.parseGithubResponse(rangeRequest) if result.Error != nil { return count, result.Error } @@ -100,7 +100,7 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { "Retrying query... current lower bound: %d, got: %d\n", c.cache[prev], result.Parsed.TotalCount) - result = parseGithubResponse(rangeRequest, c.retryCount) + result = c.gcl.parseGithubResponse(rangeRequest) if result.Error != nil { return count, result.Error } diff --git a/internal/search/go.mod b/internal/search/go.mod index e408ff68c..0d7da9db8 100644 --- a/internal/search/go.mod +++ b/internal/search/go.mod @@ -3,6 +3,8 @@ module sigs.k8s.io/kustomize/internal/search go 1.12 require ( + github.com/gomodule/redigo v2.0.0+incompatible + github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 gopkg.in/yaml.v2 v2.2.2 // indirect sigs.k8s.io/yaml v1.1.0 ) diff --git a/internal/search/go.sum b/internal/search/go.sum index 60aa01b56..1a7e918fd 100644 --- a/internal/search/go.sum +++ b/internal/search/go.sum @@ -1,3 +1,7 @@ +github.com/gomodule/redigo v2.0.0+incompatible h1:K/R+8tc58AaqLkqG2Ol3Qk+DR/TlNuhuh457pBFPtt0= +github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= diff --git a/internal/search/httpclient/httpclient.go b/internal/search/httpclient/httpclient.go new file mode 100644 index 000000000..417c587fb --- /dev/null +++ b/internal/search/httpclient/httpclient.go @@ -0,0 +1,19 @@ +package httpclient + +import ( + "net/http" + "time" + + "github.com/gomodule/redigo/redis" + "github.com/gregjones/httpcache" + redis_cache "github.com/gregjones/httpcache/redis" +) + +func NewClient(conn redis.Conn) *http.Client { + etagCache := redis_cache.NewWithClient(conn) + tr := httpcache.NewTransport(etagCache) + return &http.Client{ + Transport: tr, + Timeout: 10 * time.Second, + } +}