package github

import (
	"fmt"
	"math/bits"
)

// Files cannot be more than 2^19 bytes, according to
// https://help.github.com/en/articles/searching-code#considerations-for-code-search
const (
	// githubMaxFileSize is the largest file size used as an upper bound when
	// searching. It must stay a power of 2 for the cache's predecessor
	// lookup to work.
	githubMaxFileSize = uint64(1 << 19)
	// githubMaxResultsPerQuery is the result count at or below which a
	// single query is considered sufficient to cover a file size range.
	githubMaxResultsPerQuery = uint64(1000)
)

// Interface for testing purposes. Not expecting to have multiple
// implementations.
type cachedSearch interface {
	// CountResults returns the number of search results for files whose
	// size lies between 0 and the given upper bound.
	CountResults(uint64) (uint64, error)
	// RequestString returns the request string for the search restricted
	// to the given file size range.
	RequestString(filesize rangeFormatter) string
}

// Cache uses bit tricks to be more efficient in detecting
// inconsistencies in the returned data from the Github API.
// Therefore, the cache expects a search to always start at 0, and
// it expects the max file size to be a power of 2. If this is to be changed
// there are a few considerations to keep in mind:
//
// 1. The cache is only efficient if the queries can be reused, so if
// the first chunk of files lives in the range 0..x, continuing the
// search for the next chunk from x+1..max (while asymptotically sane)
// may actually be less efficient since the cache is essentially reset
// at every interval. This leads to a larger number of requests in
// practice, and requests are what's expensive (rate limits).
//
// 2. The Github API is not perfectly monotonic (this is somewhat
// problematic). The current cache implementation looks at the
// predecessor entry to find out if the current value is monotonic.
// This is where the bit trick is used: since each step in the binary
// search either adds, or omits adding, a decreasing power of 2 to the
// query value, we can remove the least significant set bit to find the
// predecessor in constant time. Ultimately, since the search is rate
// limited, we could also easily afford to compute this in linear time by
// iterating over cached values.
type githubCachedSearch struct { cache map[uint64]uint64 gcl GitHubClient baseRequest request } func newCache(client GitHubClient, query Query) githubCachedSearch { return githubCachedSearch{ cache: map[uint64]uint64{ 0: 0, }, gcl: client, baseRequest: client.CodeSearchRequestWith(query), } } func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { count, cached := c.cache[upperBound] if cached { return count, nil } sizeRange := RangeWithin{0, upperBound} rangeRequest := c.RequestString(sizeRange) result := c.gcl.parseGithubResponse(rangeRequest) if result.Error != nil { return count, result.Error } // As range search uses powers of 2 for binary search, the previously // cached value is easy to find by removing the least significant set // bit from the current upperBound, since each step of the search adds // least significant set bit. // // Finding the predecessor could also be implemented by iterating over // the map to find the largest key that is smaller than upperBound if // this approach deemed too complex. trail := bits.TrailingZeros64(upperBound) prev := uint64(0) if trail != 64 { prev = upperBound - (1 << uint64(trail)) } // Sometimes the github API is not monotonically increasing, or ouputs // an erroneous value of 0, or 1. This logic makes sure that it was not // erroneous, and that the sequence continues to be monotonic by setting // the current query count to match the previous value. which at least // guarantees that the range search terminates. // // On the other hand, if files are added, then we way loose out on some // files in a reviously completed range, but these files should be there // the next time the crawler runs, so this is not really problematic. retryMonotonicCount := 4 for result.Parsed.TotalCount < c.cache[prev] { logger.Printf( "Retrying query... 
current lower bound: %d, got: %d\n", c.cache[prev], result.Parsed.TotalCount) result = c.gcl.parseGithubResponse(rangeRequest) if result.Error != nil { return count, result.Error } retryMonotonicCount-- if retryMonotonicCount <= 0 { result.Parsed.TotalCount = c.cache[prev] logger.Println( "Retries for monotonic check exceeded,", " setting value to match predecessor") } } count = result.Parsed.TotalCount logger.Printf("Caching new query %s, with count %d\n", sizeRange.RangeString(), count) c.cache[upperBound] = count return count, nil } func (c githubCachedSearch) RequestString(filesize rangeFormatter) string { return c.baseRequest.CopyWith(Filesize(filesize)).URL() } // Outputs a (possibly incomplete) list of ranges to query to find most search // results as permissible by the search github search API. Github search only // allows 1,000 results per query (paginated). // Source: https://developer.github.com/v3/search/ // // This leaves the possibility of having file sizes with more than 1000 results, // This would mean that the search as it is could not find all files. If queries // are sorted by last indexed, and retrieved on regular intervals, it should be // sufficient to get most if not all documents. func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { totalFiles, err := cache.CountResults(githubMaxFileSize) if err != nil { return nil, err } logger.Println("total files: ", totalFiles) if githubMaxResultsPerQuery >= totalFiles { return []string{ cache.RequestString(RangeWithin{0, githubMaxFileSize}), }, nil } // Find all the ranges of file sizes such that all files are queryable // using the Github API. This does not compute an optimal ranges, since // the number of queries needed to get the information required to // compute an optimal range is expected to be much larger than the // number of queries performed this way. 
// // The number of ranges is k = (number of files)/1000, and finding a // range is logarithmic in the max file size (n = filesize). This means // that preprocessing takes O(k * lg n) queries to find the ranges with // a binary search over file sizes. // // My intuition is that this approach is competitive to a perfectly // optimal solution, but I didn't actually take the time to do a // rigurous proof. Intuitively, since files sizes are typically power // law distibuted the binary search will be very skewed towards the // smaller file ranges. This means that in practice this approach will // make fewer than (#files/1000)*(log(n) = 19) queries for // preprocessing, since it reuses a lot of the queries in the denser // ranges. Furthermore, because of the distribution, it should be very // easy to find ranges that are very close to the upper bound, up to // the limiting factor of having no more than 1000 files accessible per // range. filesAccessible := uint64(0) sizes := make([]uint64, 0) for filesAccessible < totalFiles { target := filesAccessible + githubMaxResultsPerQuery if target >= totalFiles { break } logger.Printf("%d accessible files, next target = %d\n", filesAccessible, target) cur, err := lowerBoundFileCount(cache, target) if err != nil { return nil, err } // If there are more than 1000 files in the next bucket, we must // advance anyway and lose out on some files :(. 
if l := len(sizes); l > 0 && sizes[l-1] == cur { cur++ } nextAccessible, err := cache.CountResults(cur) if err != nil { return nil, fmt.Errorf( "cache should be populated at %d already, got %v", cur, err) } if nextAccessible < filesAccessible { return nil, fmt.Errorf( "Number of results dropped from %d to %d within range search", filesAccessible, nextAccessible) } filesAccessible = nextAccessible if nextAccessible < totalFiles { sizes = append(sizes, cur) } } return formatFilesizeRanges(cache, sizes), nil } // lowerBoundFileCount finds the filesize range from [0, return value] that has // the largest file count that is smaller than or equal to // githubMaxResultsPerQuery. It is important to note that this returned value // could already be in a previous range if the next file size has more than 1000 // results. It is left to the caller to handle this bit of logic and guarantee // forward progession in this case. func lowerBoundFileCount( cache cachedSearch, targetFileCount uint64) (uint64, error) { // Binary search for file sizes that make up the next <=1000 element // chunk. cur := uint64(0) increase := githubMaxFileSize / 2 for increase > 0 { mid := cur + increase count, err := cache.CountResults(mid) if err != nil { return count, err } if count <= targetFileCount { cur = mid } if count == targetFileCount { break } increase /= 2 } return cur, nil } func formatFilesizeRanges(cache cachedSearch, sizes []uint64) []string { ranges := make([]string, 0, len(sizes)+1) if len(sizes) > 0 { ranges = append(ranges, cache.RequestString( RangeLessThan{sizes[0] + 1}, )) } for i := 0; i < len(sizes)-1; i += 1 { ranges = append(ranges, cache.RequestString( RangeWithin{sizes[i] + 1, sizes[i+1]}, )) if i != len(sizes)-2 { continue } ranges = append(ranges, cache.RequestString( RangeGreaterThan{sizes[i+1]}, )) } return ranges }