Update crawler to cache web request form github.

- Increase logging signal to noise ratio.
- Allow to specify the `http.Client` for github requests. (This allows
 the use of caching http.Clients).
- Clean up implementation.
This commit is contained in:
Damien Robichaud
2019-07-30 16:01:00 -07:00
parent df779fd720
commit fe45157b26
6 changed files with 150 additions and 79 deletions

View File

@@ -24,19 +24,28 @@ var logger = log.New(os.Stdout, "Github Crawler: ",
// Implements crawler.Crawler. // Implements crawler.Crawler.
type githubCrawler struct { type githubCrawler struct {
rc RequestConfig client GitHubClient
query Query query Query
} }
func NewCrawler( type GitHubClient struct {
accessToken string, retryCount uint64, query Query) githubCrawler { RequestConfig
retryCount uint64
client *http.Client
}
func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
query Query) githubCrawler {
return githubCrawler{ return githubCrawler{
rc: RequestConfig{ client: GitHubClient{
perPage: githubMaxPageSize,
retryCount: retryCount, retryCount: retryCount,
client: client,
RequestConfig: RequestConfig{
perPage: githubMaxPageSize,
accessToken: accessToken, accessToken: accessToken,
}, },
},
query: query, query: query,
} }
} }
@@ -45,19 +54,27 @@ func NewCrawler(
func (gc githubCrawler) Crawl( func (gc githubCrawler) Crawl(
ctx context.Context, output chan<- *doc.KustomizationDocument) error { ctx context.Context, output chan<- *doc.KustomizationDocument) error {
noETagClient := GitHubClient{
RequestConfig: gc.client.RequestConfig,
client: &http.Client{Timeout: gc.client.client.Timeout},
retryCount: gc.client.retryCount,
}
// Since Github returns a max of 1000 results per query, we can use // Since Github returns a max of 1000 results per query, we can use
// multiple queries that split the search space into chunks of at most // multiple queries that split the search space into chunks of at most
// 1000 files to get all of the data. // 1000 files to get all of the data.
ranges, err := FindRangesForRepoSearch(newCache(gc.rc, gc.query)) ranges, err := FindRangesForRepoSearch(newCache(noETagClient, gc.query))
if err != nil { if err != nil {
return fmt.Errorf("could not split search into ranges, %v\n", return fmt.Errorf("could not split %v into ranges, %v\n",
err) gc.query, err)
} }
logger.Println("ranges: ", ranges)
// Query each range for files. // Query each range for files.
errs := make(multiError, 0) errs := make(multiError, 0)
for _, query := range ranges { for _, query := range ranges {
err := processQuery(ctx, gc.rc, query, output) err := processQuery(ctx, gc.client, query, output)
if err != nil { if err != nil {
errs = append(errs, err) errs = append(errs, err)
} }
@@ -68,7 +85,7 @@ func (gc githubCrawler) Crawl(
// processQuery follows all of the pages in a query, and updates/adds the // processQuery follows all of the pages in a query, and updates/adds the
// documents from the crawl to the datastore/index. // documents from the crawl to the datastore/index.
func processQuery(ctx context.Context, rc RequestConfig, query string, func processQuery(ctx context.Context, gcl GitHubClient, query string,
output chan<- *doc.KustomizationDocument) error { output chan<- *doc.KustomizationDocument) error {
queryPages := make(chan GithubResponseInfo) queryPages := make(chan GithubResponseInfo)
@@ -77,8 +94,7 @@ func processQuery(ctx context.Context, rc RequestConfig, query string,
// Forward the document metadata to the retrieval channel. // Forward the document metadata to the retrieval channel.
// This separation allows for concurrent requests for the code // This separation allows for concurrent requests for the code
// search, and the retrieval portions of the API. // search, and the retrieval portions of the API.
err := ForwardPaginatedQuery( err := gcl.ForwardPaginatedQuery(ctx, query, queryPages)
ctx, query, rc.retryCount, queryPages)
if err != nil { if err != nil {
// TODO(damienr74) handle this error with redis? // TODO(damienr74) handle this error with redis?
logger.Println(err) logger.Println(err)
@@ -87,6 +103,8 @@ func processQuery(ctx context.Context, rc RequestConfig, query string,
}() }()
errs := make(multiError, 0) errs := make(multiError, 0)
errorCnt := 0
totalCnt := 0
for page := range queryPages { for page := range queryPages {
if page.Error != nil { if page.Error != nil {
errs = append(errs, page.Error) errs = append(errs, page.Error)
@@ -101,35 +119,40 @@ func processQuery(ctx context.Context, rc RequestConfig, query string,
// search when we find a file that has been seen, or we // search when we find a file that has been seen, or we
// can choose to selectively update files. // can choose to selectively update files.
k, err := kustomizationResultAdapter(rc, file) k, err := kustomizationResultAdapter(gcl, file)
if err != nil { if err != nil {
errs = append(errs, err) errs = append(errs, err)
errorCnt++
continue continue
} }
output <- k output <- k
totalCnt++
} }
logger.Printf("got %d files out of %d from API. %d of %d had errors\n",
totalCnt, page.Parsed.TotalCount, errorCnt, totalCnt)
} }
return errs return errs
} }
func kustomizationResultAdapter(rc RequestConfig, k GithubFileSpec) ( func kustomizationResultAdapter(gcl GitHubClient, k GithubFileSpec) (
*doc.KustomizationDocument, error) { *doc.KustomizationDocument, error) {
data, err := GetFileData(rc, k) data, err := gcl.GetFileData(k)
if err != nil { if err != nil {
return nil, err return nil, err
} }
creationTime, err := GetFileCreationTime(rc, k) creationTime, err := gcl.GetFileCreationTime(k)
if err != nil { if err != nil {
logger.Printf("(Error: %v) initializing to current time.", err) logger.Printf("(error: %v) initializing to current time.", err)
} }
doc := doc.KustomizationDocument{ doc := doc.KustomizationDocument{
DocumentData: string(data), DocumentData: string(data),
FilePath: doc.Atom(k.Path), FilePath: k.Path,
RepositoryURL: doc.Atom(k.Repository.URL), RepositoryURL: k.Repository.URL,
CreationTime: creationTime, CreationTime: creationTime,
} }
@@ -139,10 +162,12 @@ func kustomizationResultAdapter(rc RequestConfig, k GithubFileSpec) (
// ForwardPaginatedQuery follows the links to the next pages and performs all of // ForwardPaginatedQuery follows the links to the next pages and performs all of
// the queries for a given search query, relaying the data from each request // the queries for a given search query, relaying the data from each request
// back to an output channel. // back to an output channel.
func ForwardPaginatedQuery(ctx context.Context, query string, retryCount uint64, func (gcl GitHubClient) ForwardPaginatedQuery(ctx context.Context, query string,
output chan<- GithubResponseInfo) error { output chan<- GithubResponseInfo) error {
response := parseGithubResponse(query, retryCount) logger.Println("querying: ", query)
response := gcl.parseGithubResponse(query)
if response.Error != nil { if response.Error != nil {
return response.Error return response.Error
} }
@@ -154,7 +179,7 @@ func ForwardPaginatedQuery(ctx context.Context, query string, retryCount uint64,
case <-ctx.Done(): case <-ctx.Done():
return nil return nil
default: default:
response = parseGithubResponse(response.NextURL, retryCount) response = gcl.parseGithubResponse(response.NextURL)
if response.Error != nil { if response.Error != nil {
return response.Error return response.Error
} }
@@ -167,20 +192,22 @@ func ForwardPaginatedQuery(ctx context.Context, query string, retryCount uint64,
} }
// GetFileData gets the bytes from a file. // GetFileData gets the bytes from a file.
func GetFileData(rc RequestConfig, k GithubFileSpec) ([]byte, error) { func (gcl GitHubClient) GetFileData(k GithubFileSpec) ([]byte, error) {
url := rc.ContentsRequest(k.Repository.FullName, k.Path) url := gcl.ContentsRequest(k.Repository.FullName, k.Path)
logger.Println("content-url ", url) resp, err := gcl.GetReposData(url)
resp, err := GetReposData(url, rc.RetryCount())
if err != nil { if err != nil {
return nil, err return nil, fmt.Errorf("%+v: could not get '%s' metadata: %v",
k, url, err)
} }
data, err := ioutil.ReadAll(resp.Body) data, err := ioutil.ReadAll(resp.Body)
if err != nil { if err != nil {
return nil, err return nil, fmt.Errorf("%+v: could not read '%s' metadata: %v",
k, url, err)
} }
resp.Body.Close()
type githubContentRawURL struct { type githubContentRawURL struct {
DownloadURL string `json:"download_url,omitempty"` DownloadURL string `json:"download_url,omitempty"`
@@ -188,30 +215,33 @@ func GetFileData(rc RequestConfig, k GithubFileSpec) ([]byte, error) {
var rawURL githubContentRawURL var rawURL githubContentRawURL
err = json.Unmarshal(data, &rawURL) err = json.Unmarshal(data, &rawURL)
if err != nil { if err != nil {
return nil, err return nil, fmt.Errorf(
"%+v: could not get 'download_url' from '%s' response: %v",
k, data, err)
} }
logger.Println("raw-data-url", rawURL.DownloadURL) resp, err = gcl.GetRawUserContent(rawURL.DownloadURL)
resp, err = GetReposData(rawURL.DownloadURL, rc.RetryCount())
if err != nil { if err != nil {
return nil, err return nil, fmt.Errorf("%+v: could not fetch file raw data '%s': %v",
k, rawURL.DownloadURL, err)
} }
defer resp.Body.Close()
return ioutil.ReadAll(resp.Body) return ioutil.ReadAll(resp.Body)
} }
// GetFileCreationTime gets the earliest date of a file. // GetFileCreationTime gets the earliest date of a file.
func GetFileCreationTime( func (gcl GitHubClient) GetFileCreationTime(
rc RequestConfig, k GithubFileSpec) (time.Time, error) { k GithubFileSpec) (time.Time, error) {
url := rc.CommitsRequest(k.Repository.FullName, k.Path) url := gcl.CommitsRequest(k.Repository.FullName, k.Path)
defaultTime := time.Now() defaultTime := time.Now()
logger.Println("commits-url", url) resp, err := gcl.GetReposData(url)
resp, err := GetReposData(url, rc.RetryCount())
if err != nil { if err != nil {
return defaultTime, err return defaultTime, fmt.Errorf(
"%+v: '%s' could not get metadata: %v", k, url, err)
} }
type DateSpec struct { type DateSpec struct {
@@ -224,18 +254,27 @@ func GetFileCreationTime(
_, lastURL := parseGithubLinkFormat(resp.Header.Get("link")) _, lastURL := parseGithubLinkFormat(resp.Header.Get("link"))
if lastURL != "" { if lastURL != "" {
resp, err = GetReposData(lastURL, rc.RetryCount()) resp, err = gcl.GetReposData(lastURL)
if err != nil { if err != nil {
return defaultTime, err return defaultTime, fmt.Errorf(
"%+v: '%s' could not get metadata: %v",
k, lastURL, err)
} }
} }
defer resp.Body.Close()
data, err := ioutil.ReadAll(resp.Body) data, err := ioutil.ReadAll(resp.Body)
if err != nil {
return defaultTime, fmt.Errorf(
"%+v: failed to read metadata: %v", k, err)
}
earliestDate := []DateSpec{} earliestDate := []DateSpec{}
err = json.Unmarshal(data, &earliestDate) err = json.Unmarshal(data, &earliestDate)
size := len(earliestDate) size := len(earliestDate)
if err != nil || size == 0 { if err != nil || size == 0 {
return defaultTime, err return defaultTime, fmt.Errorf(
"%+v: server response '%s' not in expected format: %v",
k, data, err)
} }
return time.Parse(time.RFC3339, earliestDate[size-1].Commit.Author.Date) return time.Parse(time.RFC3339, earliestDate[size-1].Commit.Author.Date)
@@ -244,6 +283,9 @@ func GetFileCreationTime(
// TODO(damienr74) change the tickers to actually check api rate limits, reset // TODO(damienr74) change the tickers to actually check api rate limits, reset
// times, and throttle requests dynamically based off of current utilization, // times, and throttle requests dynamically based off of current utilization,
// instead of hardcoding the documented values, these calls are not quota'd. // instead of hardcoding the documented values, these calls are not quota'd.
// This is now especially important, since caching the API requests will reduce
// API quota use (so we can actually make more requests in the allotted time
// period).
// //
// See https://developer.github.com/v3/rate_limit/ for details. // See https://developer.github.com/v3/rate_limit/ for details.
var ( var (
@@ -334,8 +376,8 @@ func parseGithubLinkFormat(links string) (string, string) {
return next, last return next, last
} }
func parseGithubResponse(getRequest string, retryCount uint64) GithubResponseInfo { func (gcl GitHubClient) parseGithubResponse(getRequest string) GithubResponseInfo {
resp, err := SearchGithubAPI(getRequest, retryCount) resp, err := gcl.SearchGithubAPI(getRequest)
requestInfo := GithubResponseInfo{ requestInfo := GithubResponseInfo{
Response: resp, Response: resp,
Error: err, Error: err,
@@ -347,17 +389,18 @@ func parseGithubResponse(getRequest string, retryCount uint64) GithubResponseInf
} }
var data []byte var data []byte
defer resp.Body.Close()
data, requestInfo.Error = ioutil.ReadAll(resp.Body) data, requestInfo.Error = ioutil.ReadAll(resp.Body)
if requestInfo.Error != nil { if requestInfo.Error != nil {
return requestInfo return requestInfo
} }
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
logger.Println("Query: ", getRequest) logger.Println("query: ", getRequest)
logger.Println("Status not OK at the source") logger.Println("status not OK at the source")
logger.Println("Header Dump", resp.Header) logger.Println("header dump", resp.Header)
logger.Println("Body Dump", string(data)) logger.Println("body dump", string(data))
requestInfo.Error = fmt.Errorf("Request Rejected, Status '%s'", requestInfo.Error = fmt.Errorf("request rejected, status '%s'",
resp.Status) resp.Status)
return requestInfo return requestInfo
} }
@@ -382,23 +425,30 @@ func parseGithubResponse(getRequest string, retryCount uint64) GithubResponseInf
// SearchGithubAPI performs a search query and handles rate limitting for // SearchGithubAPI performs a search query and handles rate limitting for
// the 'code/search?' endpoint as well as timed retries in the case of abuse // the 'code/search?' endpoint as well as timed retries in the case of abuse
// prevention. // prevention.
func SearchGithubAPI(query string, retryCount uint64) (*http.Response, error) { func (gcl GitHubClient) SearchGithubAPI(query string) (*http.Response, error) {
throttleSearchAPI() throttleSearchAPI()
return getWithRetry(query, retryCount) return gcl.getWithRetry(query)
} }
// GetReposData performs a search query and handles rate limitting for // GetReposData performs a search query and handles rate limitting for
// the '/repos' endpoint as well as timed retries in the case of abuse // the '/repos' endpoint as well as timed retries in the case of abuse
// prevention. // prevention.
func GetReposData(query string, retryCount uint64) (*http.Response, error) { func (gcl GitHubClient) GetReposData(query string) (*http.Response, error) {
throttleRepoAPI() throttleRepoAPI()
return getWithRetry(query, retryCount) return gcl.getWithRetry(query)
} }
func getWithRetry( // User content (file contents) is not API rate limited, so there's no use in
query string, retryCount uint64) (resp *http.Response, err error) { // throttling this call.
func (gcl GitHubClient) GetRawUserContent(query string) (*http.Response, error) {
return gcl.getWithRetry(query)
}
resp, err = http.Get(query) func (gcl GitHubClient) getWithRetry(
query string) (resp *http.Response, err error) {
resp, err = gcl.client.Get(query)
retryCount := gcl.retryCount
for err == nil && for err == nil &&
resp.StatusCode == http.StatusForbidden && resp.StatusCode == http.StatusForbidden &&
@@ -407,15 +457,21 @@ func getWithRetry(
retryTime := resp.Header.Get("Retry-After") retryTime := resp.Header.Get("Retry-After")
i, err := strconv.Atoi(retryTime) i, err := strconv.Atoi(retryTime)
if err != nil { if err != nil {
return resp, fmt.Errorf("Forbidden without 'Retry-After'") return resp, fmt.Errorf(
"query '%s' forbidden without 'Retry-After'", query)
} }
logger.Printf( logger.Printf(
"Status Forbidden, retring %d more times\n", retryCount) "status forbidden, retring %d more times\n", retryCount)
logger.Printf("Waiting %d seconds before retrying\n", i) logger.Printf("waiting %d seconds before retrying\n", i)
time.Sleep(time.Second * time.Duration(i)) time.Sleep(time.Second * time.Duration(i))
retryCount-- retryCount--
resp, err = http.Get(query) resp, err = gcl.client.Get(query)
}
if err != nil {
return resp, fmt.Errorf("query '%s' could not be processed, %v",
query, err)
} }
return resp, err return resp, err

View File

@@ -99,16 +99,12 @@ func Path(p string) queryField {
// determine the date of a file. // determine the date of a file.
type RequestConfig struct { type RequestConfig struct {
perPage uint64 perPage uint64
retryCount uint64
accessToken string accessToken string
} }
func NewRequestConfig( func NewRequestConfig(perPage uint64, accessToken string) RequestConfig {
perPage, retryCount uint64, accessToken string) RequestConfig {
return RequestConfig{ return RequestConfig{
perPage: perPage, perPage: perPage,
retryCount: retryCount,
accessToken: accessToken, accessToken: accessToken,
} }
} }
@@ -138,12 +134,6 @@ func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
return rc.makeRequest(uri, Query{Path(path)}).URL() return rc.makeRequest(uri, Query{Path(path)}).URL()
} }
// How many times to retry the queries before giving up (used by the crawler,
// not Github).
func (rc RequestConfig) RetryCount() uint64 {
return rc.retryCount
}
func (rc RequestConfig) makeRequest(path string, query Query) request { func (rc RequestConfig) makeRequest(path string, query Query) request {
vals := url.Values{} vals := url.Values{}
if rc.accessToken != "" { if rc.accessToken != "" {

View File

@@ -43,17 +43,17 @@ type cachedSearch interface {
// over cached values. // over cached values.
type githubCachedSearch struct { type githubCachedSearch struct {
cache map[uint64]uint64 cache map[uint64]uint64
retryCount uint64 gcl GitHubClient
baseRequest request baseRequest request
} }
func newCache(rc RequestConfig, query Query) githubCachedSearch { func newCache(client GitHubClient, query Query) githubCachedSearch {
return githubCachedSearch{ return githubCachedSearch{
cache: map[uint64]uint64{ cache: map[uint64]uint64{
0: 0, 0: 0,
}, },
retryCount: rc.RetryCount(), gcl: client,
baseRequest: rc.CodeSearchRequestWith(query), baseRequest: client.CodeSearchRequestWith(query),
} }
} }
@@ -66,7 +66,7 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) {
sizeRange := RangeWithin{0, upperBound} sizeRange := RangeWithin{0, upperBound}
rangeRequest := c.RequestString(sizeRange) rangeRequest := c.RequestString(sizeRange)
result := parseGithubResponse(rangeRequest, c.retryCount) result := c.gcl.parseGithubResponse(rangeRequest)
if result.Error != nil { if result.Error != nil {
return count, result.Error return count, result.Error
} }
@@ -100,7 +100,7 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) {
"Retrying query... current lower bound: %d, got: %d\n", "Retrying query... current lower bound: %d, got: %d\n",
c.cache[prev], result.Parsed.TotalCount) c.cache[prev], result.Parsed.TotalCount)
result = parseGithubResponse(rangeRequest, c.retryCount) result = c.gcl.parseGithubResponse(rangeRequest)
if result.Error != nil { if result.Error != nil {
return count, result.Error return count, result.Error
} }

View File

@@ -3,6 +3,8 @@ module sigs.k8s.io/kustomize/internal/search
go 1.12 go 1.12
require ( require (
github.com/gomodule/redigo v2.0.0+incompatible
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79
gopkg.in/yaml.v2 v2.2.2 // indirect gopkg.in/yaml.v2 v2.2.2 // indirect
sigs.k8s.io/yaml v1.1.0 sigs.k8s.io/yaml v1.1.0
) )

View File

@@ -1,3 +1,7 @@
github.com/gomodule/redigo v2.0.0+incompatible h1:K/R+8tc58AaqLkqG2Ol3Qk+DR/TlNuhuh457pBFPtt0=
github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4=
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA=
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=

View File

@@ -0,0 +1,19 @@
package httpclient
import (
"net/http"
"time"
"github.com/gomodule/redigo/redis"
"github.com/gregjones/httpcache"
redis_cache "github.com/gregjones/httpcache/redis"
)
func NewClient(conn redis.Conn) *http.Client {
etagCache := redis_cache.NewWithClient(conn)
tr := httpcache.NewTransport(etagCache)
return &http.Client{
Transport: tr,
Timeout: 10 * time.Second,
}
}