diff --git a/internal/search/crawler/github/crawler.go b/internal/search/crawler/github/crawler.go index 2c0ea9ed9..8c7c81f57 100644 --- a/internal/search/crawler/github/crawler.go +++ b/internal/search/crawler/github/crawler.go @@ -45,13 +45,14 @@ func NewCrawler( func (gc githubCrawler) Crawl( ctx context.Context, output chan<- *doc.KustomizationDocument) error { - // Range finding will be added in the next PR to make this one - // simpler/shorter. It would return multiple search queries for the - // Github API such that all documents can be retrieved. This is - // required since Github returns a max of 1000 results per query, so - // multiple queries that split the search space into chunks of 1000 - // kustomization files is required. - ranges := []string{gc.query.String()} + // Since Github returns a max of 1000 results per query, we can use + // multiple queries that split the search space into chunks of at most + // 1000 files to get all of the data. + ranges, err := FindRangesForRepoSearch(newCache(gc.rc, gc.query)) + if err != nil { + return fmt.Errorf("could not split search into ranges, %v\n", + err) + } // Query each range for files. errs := make(multiError, 0) diff --git a/internal/search/crawler/github/split_search_ranges.go b/internal/search/crawler/github/split_search_ranges.go new file mode 100644 index 000000000..c098fa6ac --- /dev/null +++ b/internal/search/crawler/github/split_search_ranges.go @@ -0,0 +1,274 @@ +package github + +import ( + "fmt" + "math/bits" +) + +// Files cannot be more than 2^19 bytes, according to +// https://help.github.com/en/articles/searching-code#considerations-for-code-search +const ( + githubMaxFileSize = uint64(1 << 19) + githubMaxResultsPerQuery = uint64(1000) +) + +// Interface for testing purposes. Not expecting to have multiple +// implementations. +type cachedSearch interface { + CountResults(uint64) (uint64, error) + RequestString(filesize rangeFormatter) string +} + +// Cache uses bit tricks to be more efficient in detecting +// inconsistencies in the returned data from the Github API. +// Therefore, the cache expects a search to always start at 0, and +// it expects the max file size to be a power of 2. If this is to be changed +// there are a few considerations to keep in mind: +// +// 1. The cache is only efficient if the queries can be reused, so if +// the first chunk of files lives in the range 0..x, continuing the +// search for the next chunk from x+1..max (while asymptotically sane) +// may actually be less efficient since the cache is essentially reset +// at every interval. This leads to a larger number of requests in +// practice, and requests are what's expensive (rate limits). +// +// 2. The github API is not perfectly monotonic.. (this is somewhat +// problematic). The current cache implementation looks at the +// predecessor entry to find out if the current value is monotonic. +// This is where the bit trick is used, since each step in the binary +// search is adding or ommiting to add a decreasing of 2 to the query value, +// we can remove the least significant set bit to find the predecessor in +// constant time. Ultimately since the search is rate limited, we could also +// easily afford to compute this in linear time by iterating +// over cached values. +type githubCachedSearch struct { + cache map[uint64]uint64 + retryCount uint64 + baseRequest request +} + +func newCache(rc RequestConfig, query Query) githubCachedSearch { + return githubCachedSearch{ + cache: map[uint64]uint64{ + 0: 0, + }, + retryCount: rc.RetryCount(), + baseRequest: rc.CodeSearchRequestWith(query), + } +} + +func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { + count, cached := c.cache[upperBound] + if cached { + return count, nil + } + + sizeRange := RangeWithin{0, upperBound} + rangeRequest := c.RequestString(sizeRange) + + result := parseGithubResponse(rangeRequest, c.retryCount) + if result.Error != nil { + return count, result.Error + } + + // As range search uses powers of 2 for binary search, the previously + // cached value is easy to find by removing the least significant set + // bit from the current upperBound, since each step of the search adds + // least significant set bit. + // + // Finding the predecessor could also be implemented by iterating over + // the map to find the largest key that is smaller than upperBound if + // this approach deemed too complex. + trail := bits.TrailingZeros64(upperBound) + prev := uint64(0) + if trail != 64 { + prev = upperBound - (1 << uint64(trail)) + } + + // Sometimes the github API is not monotonically increasing, or ouputs + // an erroneous value of 0, or 1. This logic makes sure that it was not + // erroneous, and that the sequence continues to be monotonic by setting + // the current query count to match the previous value. which at least + // guarantees that the range search terminates. + // + // On the other hand, if files are added, then we way loose out on some + // files in a reviously completed range, but these files should be there + // the next time the crawler runs, so this is not really problematic. + retryMonotonicCount := 4 + for result.Parsed.TotalCount < c.cache[prev] { + logger.Printf( + "Retrying query... current lower bound: %d, got: %d\n", + c.cache[prev], result.Parsed.TotalCount) + + result = parseGithubResponse(rangeRequest, c.retryCount) + if result.Error != nil { + return count, result.Error + } + + retryMonotonicCount-- + if retryMonotonicCount <= 0 { + result.Parsed.TotalCount = c.cache[prev] + logger.Println( + "Retries for monotonic check exceeded,", + " setting value to match predecessor") + } + } + + count = result.Parsed.TotalCount + logger.Printf("Caching new query %s, with count %d\n", + sizeRange.RangeString(), count) + c.cache[upperBound] = count + return count, nil +} + +func (c githubCachedSearch) RequestString(filesize rangeFormatter) string { + return c.baseRequest.CopyWith(Filesize(filesize)).URL() +} + +// Outputs a (possibly incomplete) list of ranges to query to find most search +// results as permissible by the search github search API. Github search only +// allows 1,000 results per query (paginated). +// Source: https://developer.github.com/v3/search/ +// +// This leaves the possibility of having file sizes with more than 1000 results, +// This would mean that the search as it is could not find all files. If queries +// are sorted by last indexed, and retrieved on regular intervals, it should be +// sufficient to get most if not all documents. +func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { + totalFiles, err := cache.CountResults(githubMaxFileSize) + if err != nil { + return nil, err + } + logger.Println("total files: ", totalFiles) + + if githubMaxResultsPerQuery >= totalFiles { + return []string{ + cache.RequestString(RangeWithin{0, githubMaxFileSize}), + }, nil + } + + // Find all the ranges of file sizes such that all files are queryable + // using the Github API. This does not compute an optimal ranges, since + // the number of queries needed to get the information required to + // compute an optimal range is expected to be much larger than the + // number of queries performed this way. + // + // The number of ranges is k = (number of files)/1000, and finding a + // range is logarithmic in the max file size (n = filesize). This means + // that preprocessing takes O(k * lg n) queries to find the ranges with + // a binary search over file sizes. + // + // My intuition is that this approach is competitive to a perfectly + // optimal solution, but I didn't actually take the time to do a + // rigurous proof. Intuitively, since files sizes are typically power + // law distibuted the binary search will be very skewed towards the + // smaller file ranges. This means that in practice this approach will + // make fewer than (#files/1000)*(log(n) = 19) queries for + // preprocessing, since it reuses a lot of the queries in the denser + // ranges. Furthermore, because of the distribution, it should be very + // easy to find ranges that are very close to the upper bound, up to + // the limiting factor of having no more than 1000 files accessible per + // range. + filesAccessible := uint64(0) + sizes := make([]uint64, 0) + for filesAccessible < totalFiles { + target := filesAccessible + githubMaxResultsPerQuery + if target >= totalFiles { + break + } + + logger.Printf("%d accessible files, next target = %d\n", + filesAccessible, target) + + cur, err := lowerBoundFileCount(cache, target) + if err != nil { + return nil, err + } + + // If there are more than 1000 files in the next bucket, we must + // advance anyway and lose out on some files :(. + if l := len(sizes); l > 0 && sizes[l-1] == cur { + cur++ + } + + nextAccessible, err := cache.CountResults(cur) + if err != nil { + return nil, fmt.Errorf( + "cache should be populated at %d already, got %v", + cur, err) + } + if nextAccessible < filesAccessible { + return nil, fmt.Errorf( + "Number of results dropped from %d to %d within range search", + filesAccessible, nextAccessible) + } + + filesAccessible = nextAccessible + if nextAccessible < totalFiles { + sizes = append(sizes, cur) + } + } + + return formatFilesizeRanges(cache, sizes), nil +} + +// lowerBoundFileCount finds the filesize range from [0, return value] that has +// the largest file count that is smaller than or equal to +// githubMaxResultsPerQuery. It is important to note that this returned value +// could already be in a previous range if the next file size has more than 1000 +// results. It is left to the caller to handle this bit of logic and guarantee +// forward progession in this case. +func lowerBoundFileCount( + cache cachedSearch, targetFileCount uint64) (uint64, error) { + + // Binary search for file sizes that make up the next <=1000 element + // chunk. + cur := uint64(0) + increase := githubMaxFileSize / 2 + + for increase > 0 { + mid := cur + increase + + count, err := cache.CountResults(mid) + if err != nil { + return count, err + } + + if count <= targetFileCount { + cur = mid + } + + if count == targetFileCount { + break + } + + increase /= 2 + } + + return cur, nil +} + +func formatFilesizeRanges(cache cachedSearch, sizes []uint64) []string { + ranges := make([]string, 0, len(sizes)+1) + + if len(sizes) > 0 { + ranges = append(ranges, cache.RequestString( + RangeLessThan{sizes[0] + 1}, + )) + } + + for i := 0; i < len(sizes)-1; i += 1 { + ranges = append(ranges, cache.RequestString( + RangeWithin{sizes[i] + 1, sizes[i+1]}, + )) + + if i != len(sizes)-2 { + continue + } + ranges = append(ranges, cache.RequestString( + RangeGreaterThan{sizes[i+1]}, + )) + } + + return ranges +} diff --git a/internal/search/crawler/github/split_search_ranges_test.go b/internal/search/crawler/github/split_search_ranges_test.go new file mode 100644 index 000000000..c175486e6 --- /dev/null +++ b/internal/search/crawler/github/split_search_ranges_test.go @@ -0,0 +1,90 @@ +package github + +import ( + "fmt" + "reflect" + "testing" +) + +type testCachedSearch struct { + cache map[uint64]uint64 +} + +func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) { + fmt.Printf("CountResults(%05x)\n", upperBound) + count, ok := c.cache[upperBound] + if !ok { + return count, fmt.Errorf("cache not set at %x", upperBound) + } + return count, nil +} + +func (c testCachedSearch) RequestString(filesize rangeFormatter) string { + return filesize.RangeString() +} + +// TODO(damienr74) make tests easier to write.. I'm thinking I can make the test +// cache take in a list of (filesize, count) pairs and it can populate the cache +// without relying on how the implementation will create queries. This was only +// a quick and dirty test to make sure that modifications are not going to break +// the functionality. +func TestRangeSplitting(t *testing.T) { + // Keys follow the binary search depending on whether or not the range + // is too small/large to find close to optimal filesize ranges. This + // test is heavily tied to the fact that the search is using powers of two + // to make progress in the search (hence the use of hexadecimal values). + cache := testCachedSearch{ + map[uint64]uint64{ + 0x80000: 5000, + 0x40000: 5000, + 0x20000: 5000, + 0x10000: 5000, + 0x08000: 5000, + 0x04000: 5000, + 0x02000: 5000, + 0x01000: 5000, + 0x00fff: 3950, + 0x00ffe: 3950, + 0x00ffc: 3950, + 0x00ff8: 3950, + 0x00ff0: 3950, + 0x00fe0: 3950, + 0x00fc0: 3950, + 0x00f80: 3950, + 0x00f00: 3950, + 0x00e00: 3950, + 0x00c00: 3950, + 0x00800: 3950, + 0x00400: 3950, + 0x00200: 3688, + 0x00180: 3028, + 0x00100: 2999, + 0x000c0: 2448, + 0x00080: 1999, + 0x00070: 1600, + 0x0006c: 1003, + 0x0006b: 1001, + 0x0006a: 999, + 0x00068: 999, + 0x00060: 999, + 0x00040: 999, + 0x00000: 0, + }, + } + + requests, err := FindRangesForRepoSearch(cache) + if err != nil { + t.Errorf("Error while finding ranges: %v", err) + } + expected := []string{ + "<107", // cache.RequestString(RangeLessThan{0x6b}), + "107..128", // cache.RequestString(RangeWithin{0x6b, 0x80}), + "129..256", // cache.RequestString(RangeWithin{0x81, 0x100}), + "257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}), + ">4095", // cache.RequestString(RangeGreaterThan{0xfff}), + } + + if !reflect.DeepEqual(requests, expected) { + t.Errorf("Expected requests (%v) to equal (%v)", requests, expected) + } +}