Crawler performance improvements, better structure

Refactored the crawler implementation to make the whole thing more
testable. Added a document interface to make the crawler generic.
This will be useful for collecting plugins, and other documents.
This commit is contained in:
Damien Robichaud
2019-08-23 18:11:55 -07:00
parent a66808a10d
commit c2d6f09ef3
13 changed files with 1251 additions and 99 deletions

View File

@@ -16,7 +16,11 @@ import (
"strings"
"time"
"sigs.k8s.io/kustomize/internal/tools/crawler"
"sigs.k8s.io/kustomize/internal/tools/doc"
"sigs.k8s.io/kustomize/internal/tools/httpclient"
"sigs.k8s.io/kustomize/v3/pkg/git"
"sigs.k8s.io/kustomize/v3/pkg/pgmconfig"
)
var logger = log.New(os.Stdout, "Github Crawler: ",
@@ -34,6 +38,17 @@ type GitHubClient struct {
client *http.Client
}
func NewClient(accessToken string, retryCount uint64, client *http.Client) GitHubClient {
return GitHubClient{
retryCount: retryCount,
client: client,
RequestConfig: RequestConfig{
perPage: githubMaxPageSize,
accessToken: accessToken,
},
}
}
func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
query Query) githubCrawler {
@@ -52,7 +67,7 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
// Implements crawler.Crawler.
func (gc githubCrawler) Crawl(
ctx context.Context, output chan<- *doc.KustomizationDocument) error {
ctx context.Context, output chan<- crawler.CrawlerDocument) error {
noETagClient := GitHubClient{
RequestConfig: gc.client.RequestConfig,
@@ -80,13 +95,78 @@ func (gc githubCrawler) Crawl(
}
}
return errs
if len(errs) > 0 {
return errs
}
return nil
}
func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) error {
repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch
repoSpec, err := git.NewRepoSpecFromUrl(repoURL)
if err != nil {
return fmt.Errorf("invalid repospec: %v", err)
}
url := "https://raw.githubusercontent.com/" + repoSpec.OrgRepo +
"/" + repoSpec.Ref + "/" + repoSpec.Path
handle := func(resp *http.Response, err error, path string) error {
if err == nil && resp.StatusCode == http.StatusOK {
d.IsSame = httpclient.FromCache(resp.Header)
defer resp.Body.Close()
data, err := ioutil.ReadAll(resp.Body)
if err != nil {
return err
}
d.DocumentData = string(data)
d.FilePath = d.FilePath + path
return nil
}
return err
}
resp, err := gc.client.GetRawUserContent(url)
if err := handle(resp, err, ""); err == nil {
return nil
}
for _, file := range pgmconfig.KustomizationFileNames {
resp, err = gc.client.GetRawUserContent(url + "/" + file)
err := handle(resp, err, "/"+file)
if err != nil {
continue
}
}
return fmt.Errorf("File Not Found: %s", url)
}
func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
fs := GithubFileSpec{}
fs.Repository.FullName = d.RepositoryURL + "/" + d.FilePath
creationTime, err := gc.client.GetFileCreationTime(fs)
if err != nil {
return err
}
d.CreationTime = &creationTime
return nil
}
func (gc githubCrawler) Match(d *doc.Document) bool {
url := d.RepositoryURL + "/" + d.FilePath + "?ref=" + "/" +
d.DefaultBranch
repoSpec, err := git.NewRepoSpecFromUrl(url)
if err != nil {
return false
}
return strings.Contains(repoSpec.Host, "github.com")
}
// processQuery follows all of the pages in a query, and updates/adds the
// documents from the crawl to the datastore/index.
func processQuery(ctx context.Context, gcl GitHubClient, query string,
output chan<- *doc.KustomizationDocument) error {
output chan<- crawler.CrawlerDocument) error {
queryPages := make(chan GithubResponseInfo)
@@ -112,13 +192,6 @@ func processQuery(ctx context.Context, gcl GitHubClient, query string,
}
for _, file := range page.Parsed.Items {
// TODO(damienr74) This is where we'd need to
// communicate with redis. Currently always doing a full
// reindex of the documents. Since the documents are in
// sorted order in each bucket, we can short circuit the
// search when we find a file that has been seen, or we
// can choose to selectively update files.
k, err := kustomizationResultAdapter(gcl, file)
if err != nil {
errs = append(errs, err)
@@ -137,23 +210,33 @@ func processQuery(ctx context.Context, gcl GitHubClient, query string,
}
func kustomizationResultAdapter(gcl GitHubClient, k GithubFileSpec) (
*doc.KustomizationDocument, error) {
crawler.CrawlerDocument, error) {
data, err := gcl.GetFileData(k)
if err != nil {
return nil, err
}
creationTime, err := gcl.GetFileCreationTime(k)
if err != nil {
logger.Printf("(error: %v) initializing to current time.", err)
logger.Printf(
"(error: %v) initializing to current time.\n", err)
}
url := gcl.ReposRequest(k.Repository.FullName)
defaultBranch, err := gcl.GetDefaultBranch(url)
if err != nil {
logger.Printf(
"(error: %v) setting default_branch to master\n", err)
defaultBranch = "master"
}
doc := doc.KustomizationDocument{
DocumentData: string(data),
FilePath: k.Path,
RepositoryURL: k.Repository.URL,
CreationTime: creationTime,
Document: doc.Document{
DocumentData: string(data),
FilePath: k.Path,
DefaultBranch: defaultBranch,
RepositoryURL: k.Repository.URL,
},
}
return &doc, nil
@@ -227,7 +310,34 @@ func (gcl GitHubClient) GetFileData(k GithubFileSpec) ([]byte, error) {
}
defer resp.Body.Close()
return ioutil.ReadAll(resp.Body)
data, err = ioutil.ReadAll(resp.Body)
return data, err
}
func (gcl GitHubClient) GetDefaultBranch(url string) (string, error) {
resp, err := gcl.GetReposData(url)
if err != nil {
return "", fmt.Errorf(
"'%s' could not get default_branch: %v", url, err)
}
defer resp.Body.Close()
data, err := ioutil.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf(
"could not read default_branch: %v", err)
}
type defaultBranch struct {
DefaultBranch string `json:"default_branch,omitempty"`
}
var branch defaultBranch
err = json.Unmarshal(data, &branch)
if err != nil {
return "", fmt.Errorf(
"default_branch json malformed: %v", err)
}
return branch.DefaultBranch, nil
}
// GetFileCreationTime gets the earliest date of a file.
@@ -301,29 +411,23 @@ func throttleRepoAPI() {
<-contentRateTicker.C
}
const (
accessTokenKeyword = "access_token="
perPageKeyword = "per_page="
contentSearchURL = "https://api.github.com/repos"
contentKeyword = "contents"
)
type multiError []error
func (me multiError) Error() string {
size := len(me) + 2
strs := make([]string, size)
strs[0] = "Errors [\n\t"
strs[0] = "Errors ["
for i, err := range me {
strs[i+1] = err.Error()
strs[i+1] = "\t" + err.Error()
}
strs[size-1] = "\n]"
return strings.Join(strs, "\n\t")
strs[size-1] = "]"
return strings.Join(strs, "\n")
}
type GithubFileSpec struct {
Path string `json:"path,omitempty"`
Repository struct {
API string `json:"url,omitempty"`
URL string `json:"html_url,omitempty"`
FullName string `json:"full_name,omitempty"`
} `json:"repository,omitempty"`

View File

@@ -127,6 +127,11 @@ func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string {
return rc.makeRequest(uri, Query{}).URL()
}
func (rc RequestConfig) ReposRequest(fullRepoName string) string {
uri := fmt.Sprintf("repos/%s", fullRepoName)
return rc.makeRequest(uri, Query{}).URL()
}
// CommitsRequest given the repo name, and a filepath returns a formatted query
// for the Github API to find the commits that affect this file.
func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {

View File

@@ -1,5 +1,102 @@
package github
// GitHub only returns at most 1000 results per search query,
// this is problematic if you want to retrieve all the results for a given
// search query. However, GitHub allows you to specify as much as you want per
// query to make things more specific. Specifically for files, GitHub allows
// you to specify their sizes with range queries. This is very convenient
// since it allows us to split the search into disjoint sets/shards of results
// from the different file size ranges.
//
// Some important factors to consider:
//
// - These queries are rate limited by the API to roughly once query every two
// seconds.
//
// - The search space for file sizes is in bytes, from 0B to < 512KiB (this is
// a huge search space that cannot be probed linearly in a timely manner if
// granularity is to be expected).
//
// - If you have K files there will likely be ~K/1000 sets that you have find
// from this search space in order to get all of the results.
//
// - If you have O(K) sets it is unlikely that they are all of the same size,
// since (most files are power law distributed). That means that the range
// might be significantly smaller for 1000 small files, than it is for
// 1000 large files.
//
// - This method is a best effort approach. There are some limitations to what
// it can and can't do, so please note the following:
//
// + There may very well be a filesize that has more than 1000 results.
// this method cannot help in this case. However, requerying over time
// (days/weeks/months) while sorting by last indexed values may be
// sufficient to eventually get all of the results.
//
// + It's possible that the github API returns inconsistent counts. This
// is problematic in most cases, since it can cause many issues if the
// case is not handled properly. For instance, if you requested the
// number of files of an interval from size:0..64 and get that there
// are 900 results, you may query at size:0..96 and get that there
// are 800 results. To guarantee that this approach completes and does
// not get into a query loop over the same intervals, it will retry a few
// times and take the largest of the results or the largest previously
// queried value from another range (in this case, the implementation
// could decide that size:0..96 must have 900) results. This makes the
// approach best effort even if there are no single file sizes of over
// 1000 results.
//
//
// The approach that was taken to solve this problem is the following:
//
// 1. Determine the total number of results by querying from the lower bound
// to the upper bound (size:0..max). If there are less than 1000 files,
// return a single range of values (size:0..max) since all results can be
// retrieved.
//
// 2. Otherwise, set a target number of files to be 1000.
//
// 3. Binary search for the range from 0..r that provides a file count that is
// less than or equal to the target. Once this value is found, store the
// upper bound of range (r). If r is the same as the previous value, (or 0)
// increase r by one (this guarantees progress, but will miss out on some
// results).
//
// 4. Increase the target by 1000.
//
// 5. Repeat steps 3 and 4 until the target is at or exceeds the total number
// of files.
//
//
// In general there are other ways to get all of the files from GitHub. In
// some cases it would be sufficient to just get the files that are being
// updated/indexed by github periodically to update the corpus, so this
// complicated approach does not have to be run every time. However, for
// some searches, there may be too many results on a time interval to do
// this simple update search limited to only 1000 results.
//
// There is also a more sophisticated approach that may yield better
// performance:
// - Perform this search once and create a prior distribution of file sizes.
// Each time you want to retrieve the results of the query, scale the
// prior of expected ranges to the current number of files. From each
// expected range of 1000 files, perform a exponential search to find the
// lower bound of the range. This would likely reduce the total number
// of queries by a significant amount since it would only have to search
// for a small set of values around each likely range boundary.
//
// However, actually retrieving the files will be the bottleneck operation
// since the number of queries to find the ranges will be close to:
// log2(maxFileSize) * totalResults / 1000 ~= totalResults / 50
// whereas the number of queries to actually get all of the search results
// are close to:
// apiCallsPerResult * 10(pages) * 100(resultsPerPage) * totalResults / 1000
// = apiCallsPerResult * totalResults.
//
// So it could very well take apiCallsPerResult * 50 times longer to acutally
// fetch the results (assuming the quotas for the API calls are the same as the
// search API), than it does to perform these range searches.
import (
"fmt"
"math/bits"
@@ -12,14 +109,20 @@ const (
githubMaxResultsPerQuery = uint64(1000)
)
// Interface for testing purposes. Not expecting to have multiple
// implementations.
// Interface instead of struct for testing purposes.
// Not expecting to have multiple implementations.
type cachedSearch interface {
CountResults(uint64) (uint64, error)
RequestString(filesize rangeFormatter) string
}
// Cache uses bit tricks to be more efficient in detecting
// cachedSearch is a simple data structure that maps the upper bound (r) of a
// range from 0 to r to the number of files that have between 0 and r files
// (inclusive). It also guarantees that the counts are monotonically increasing
// (not strict) as the value for r increases, by looking at the maximal
// previous file count for the value that precedes r in the cache.
//
// It uses a bit trick to be more efficient in detecting
// inconsistencies in the returned data from the Github API.
// Therefore, the cache expects a search to always start at 0, and
// it expects the max file size to be a power of 2. If this is to be changed
@@ -36,11 +139,12 @@ type cachedSearch interface {
// problematic). The current cache implementation looks at the
// predecessor entry to find out if the current value is monotonic.
// This is where the bit trick is used, since each step in the binary
// search is adding or ommiting to add a decreasing of 2 to the query value,
// we can remove the least significant set bit to find the predecessor in
// constant time. Ultimately since the search is rate limited, we could also
// easily afford to compute this in linear time by iterating
// over cached values.
// search is adding or ommiting to add a decreasing power of 2 to the query
// value, we can remove the least significant set bit to find the
// predecessor in constant time. Ultimately since the search is rate
// limited, we could also easily afford to compute this in linear time
// by iterating over cached values. So this trick is not crucial to the
// cache's performance.
type githubCachedSearch struct {
cache map[uint64]uint64
gcl GitHubClient
@@ -160,7 +264,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
//
// My intuition is that this approach is competitive to a perfectly
// optimal solution, but I didn't actually take the time to do a
// rigurous proof. Intuitively, since files sizes are typically power
// rigorous proof. Intuitively, since files sizes are typically power
// law distibuted the binary search will be very skewed towards the
// smaller file ranges. This means that in practice this approach will
// make fewer than (#files/1000)*(log(n) = 19) queries for