From 7a87c84403d98c88c735f2d46f2a96584e36f0fe Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Wed, 29 Jan 2020 11:34:41 -0800 Subject: [PATCH] Reprocess the github filesize search ranges which have more than 1000 items --- api/internal/crawl/crawler/github/crawler.go | 56 ++++++++++++-- .../crawler/github/split_search_ranges.go | 75 ++++++++++--------- .../github/split_search_ranges_test.go | 24 ++++-- 3 files changed, 104 insertions(+), 51 deletions(-) diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 21cfd6f81..23bcbcd72 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -80,6 +80,36 @@ func (gc githubCrawler) DefaultBranch(repo string) string { func (gc githubCrawler) Crawl(ctx context.Context, output chan<- crawler.CrawledDocument, seen utils.SeenMap) error { + ranges := []RangeWithin{ + RangeWithin{ + start: uint64(0), + end: githubMaxFileSize, + }, + } + + errs := make(multiError, 0) + for len(ranges) > 0 { + tailRange := ranges[len(ranges) - 1] + ranges = ranges[:(len(ranges) - 1)] + reProcessQueryRanges, err := gc.CrawlSingleRange(ctx, output, seen, tailRange.start, tailRange.end) + if err != nil { + errs = append(errs, err) + } + ranges = append(ranges, reProcessQueryRanges...) + } + + if len(errs) > 0 { + return errs + } + return nil +} + +func (gc githubCrawler) CrawlSingleRange(ctx context.Context, + output chan<- crawler.CrawledDocument, seen utils.SeenMap, + lowerBound, upperBound uint64) ([]RangeWithin, error) { + + log.Printf("CrawlSingleRange [%d, %d]", lowerBound, upperBound) + noETagClient := GhClient{ RequestConfig: gc.client.RequestConfig, client: &http.Client{Timeout: gc.client.client.Timeout}, @@ -87,13 +117,16 @@ func (gc githubCrawler) Crawl(ctx context.Context, accessToken: gc.client.accessToken, } + var reProcessQueryRanges []RangeWithin + var ranges []string var err error // Since Github returns a max of 1000 results per query, we can use // multiple queries that split the search space into chunks of at most // 1000 files to get all of the data. for i := 0; i < 5; i++ { - ranges, err = FindRangesForRepoSearch(newCache(noETagClient, gc.query)) + ranges, err = FindRangesForRepoSearch(newCache(noETagClient, gc.query), + lowerBound, upperBound) if err == nil { logger.Printf("FindRangesForRepoSearch succeeded after %d retries", i) break @@ -102,7 +135,7 @@ func (gc githubCrawler) Crawl(ctx context.Context, } } if err != nil { - return fmt.Errorf("could not split %v into ranges, %v\n", + return reProcessQueryRanges, fmt.Errorf("could not split %v into ranges, %v\n", gc.query, err) } @@ -112,20 +145,23 @@ func (gc githubCrawler) Crawl(ctx context.Context, errs := make(multiError, 0) queryResult := RangeQueryResult{} for _, query := range ranges { - rangeResult, err := processQuery(ctx, gc.client, query, output, seen, gc.branchMap) + reProcessQuery, rangeResult, err := processQuery(ctx, gc.client, query, output, seen, gc.branchMap) if err != nil { errs = append(errs, err) } queryResult.Add(rangeResult) + if reProcessQuery { + reProcessQueryRanges = append(reProcessQueryRanges, RangeSizes(query)) + } } logger.Printf("Summary of Crawl: %s", queryResult.String()) if len(errs) > 0 { - return errs + return reProcessQueryRanges, errs } - return nil + return reProcessQueryRanges, nil } // FetchDocument first tries to fetch the document with d.FilePath. If it fails, @@ -225,7 +261,7 @@ func (r *RangeQueryResult) String() string { // documents from the crawl to the datastore/index. func processQuery(ctx context.Context, gcl GhClient, query string, output chan<- crawler.CrawledDocument, seen utils.SeenMap, - branchMap map[string]string) (RangeQueryResult, error) { + branchMap map[string]string) (bool, RangeQueryResult, error) { queryPages := make(chan GhResponseInfo) @@ -241,6 +277,8 @@ func processQuery(ctx context.Context, gcl GhClient, query string, close(queryPages) }() + reProcessQuery := false + errs := make(multiError, 0) result := RangeQueryResult{} pageID := 1 @@ -271,11 +309,15 @@ func processQuery(ctx context.Context, gcl GhClient, query string, result.Add(pageResult) pageID++ + + if page.Parsed.TotalCount > githubMaxResultsPerQuery { + reProcessQuery = true + } } logger.Printf("Summary of processQuery: %s", result.String()) - return result, errs + return reProcessQuery, result, errs } func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap, diff --git a/api/internal/crawl/crawler/github/split_search_ranges.go b/api/internal/crawl/crawler/github/split_search_ranges.go index 8d9037478..81482a1b9 100644 --- a/api/internal/crawl/crawler/github/split_search_ranges.go +++ b/api/internal/crawl/crawler/github/split_search_ranges.go @@ -100,6 +100,8 @@ package github import ( "fmt" "math/bits" + "strconv" + "strings" ) // Files cannot be more than 2^19 bytes, according to @@ -112,7 +114,7 @@ const ( // Interface instead of struct for testing purposes. // Not expecting to have multiple implementations. type cachedSearch interface { - CountResults(uint64) (uint64, error) + CountResults(uint64, uint64) (uint64, error) RequestString(filesize rangeFormatter) string } @@ -161,13 +163,13 @@ func newCache(client GhClient, query Query) githubCachedSearch { } } -func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { +func (c githubCachedSearch) CountResults(lowerBound, upperBound uint64) (uint64, error) { count, cached := c.cache[upperBound] if cached { return count, nil } - sizeRange := RangeWithin{0, upperBound} + sizeRange := RangeWithin{lowerBound, upperBound} rangeRequest := c.RequestString(sizeRange) result := c.gcl.parseGithubResponse(rangeRequest) @@ -238,8 +240,8 @@ func (c githubCachedSearch) RequestString(filesize rangeFormatter) string { // This would mean that the search as it is could not find all files. If queries // are sorted by last indexed, and retrieved on regular intervals, it should be // sufficient to get most if not all documents. -func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { - totalFiles, err := cache.CountResults(githubMaxFileSize) +func FindRangesForRepoSearch(cache cachedSearch, lowerBound, upperBound uint64) ([]string, error) { + totalFiles, err := cache.CountResults(lowerBound, upperBound) if err != nil { return nil, err } @@ -247,7 +249,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { if githubMaxResultsPerQuery >= totalFiles { return []string{ - cache.RequestString(RangeWithin{0, githubMaxFileSize}), + cache.RequestString(RangeWithin{lowerBound, upperBound}), }, nil } @@ -275,6 +277,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { // range. filesAccessible := uint64(0) sizes := make([]uint64, 0) + sizes = append(sizes, lowerBound) for filesAccessible < totalFiles { target := filesAccessible + githubMaxResultsPerQuery if target >= totalFiles { @@ -284,22 +287,22 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { logger.Printf("%d accessible files, next target = %d\n", filesAccessible, target) - cur, err := lowerBoundFileCount(cache, target) + size, err := FindFileSize(cache, target, lowerBound, upperBound) if err != nil { return nil, err } // If there are more than 1000 files in the next bucket, we must // advance anyway and lose out on some files :(. - if l := len(sizes); l > 0 && sizes[l-1] == cur { - cur++ + if l := len(sizes); l > 0 && sizes[l-1] == size { + size++ } - nextAccessible, err := cache.CountResults(cur) + nextAccessible, err := cache.CountResults(lowerBound, size) if err != nil { return nil, fmt.Errorf( "cache should be populated at %d already, got %v", - cur, err) + size, err) } if nextAccessible < filesAccessible { return nil, fmt.Errorf( @@ -309,31 +312,31 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { filesAccessible = nextAccessible if nextAccessible < totalFiles { - sizes = append(sizes, cur) + sizes = append(sizes, size) } } - + sizes = append(sizes, upperBound) return formatFilesizeRanges(cache, sizes), nil } -// lowerBoundFileCount finds the filesize range from [0, return value] that has +// FindFileSize finds the filesize range from [lowerBound, return value] that has // the largest file count that is smaller than or equal to // githubMaxResultsPerQuery. It is important to note that this returned value // could already be in a previous range if the next file size has more than 1000 // results. It is left to the caller to handle this bit of logic and guarantee // forward progession in this case. -func lowerBoundFileCount( - cache cachedSearch, targetFileCount uint64) (uint64, error) { +func FindFileSize( + cache cachedSearch, targetFileCount, lowerBound, upperBound uint64) (uint64, error) { // Binary search for file sizes that make up the next <=1000 element // chunk. - cur := uint64(0) - increase := githubMaxFileSize / 2 + cur := lowerBound + increase := (upperBound - lowerBound) / 2 for increase > 0 { mid := cur + increase - count, err := cache.CountResults(mid) + count, err := cache.CountResults(lowerBound, mid) if err != nil { return count, err } @@ -353,26 +356,24 @@ func lowerBoundFileCount( } func formatFilesizeRanges(cache cachedSearch, sizes []uint64) []string { - ranges := make([]string, 0, len(sizes)+1) - - if len(sizes) > 0 { - ranges = append(ranges, cache.RequestString( - RangeLessThan{sizes[0] + 1}, - )) + n := len(sizes) + if n < 2 { + return []string{} } - for i := 0; i < len(sizes)-1; i += 1 { - ranges = append(ranges, cache.RequestString( - RangeWithin{sizes[i] + 1, sizes[i+1]}, - )) - - if i != len(sizes)-2 { - continue - } - ranges = append(ranges, cache.RequestString( - RangeGreaterThan{sizes[i+1]}, - )) + ranges := make([]string, 0, n-1) + ranges = append(ranges, cache.RequestString(RangeWithin{sizes[0], sizes[1]})) + for i := 1; i < n-1; i++ { + ranges = append(ranges, cache.RequestString(RangeWithin{sizes[i] + 1, sizes[i+1]})) } - return ranges } + +func RangeSizes(s string) RangeWithin { + start := strings.Index(s, "+size:") + len("+size:") + end := strings.Index(s, "&") + ranges := strings.Split(s[start:end], "..") + lowerBound, _ := strconv.ParseUint(ranges[0], 10, 64) + upperBound, _ := strconv.ParseUint(ranges[1], 10, 64) + return RangeWithin{lowerBound, upperBound} +} diff --git a/api/internal/crawl/crawler/github/split_search_ranges_test.go b/api/internal/crawl/crawler/github/split_search_ranges_test.go index ad332388d..788ef17bc 100644 --- a/api/internal/crawl/crawler/github/split_search_ranges_test.go +++ b/api/internal/crawl/crawler/github/split_search_ranges_test.go @@ -11,7 +11,7 @@ type testCachedSearch struct { cache map[uint64]uint64 } -func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) { +func (c testCachedSearch) CountResults(lowerBound, upperBound uint64) (uint64, error) { log.Printf("CountResults(%05x)\n", upperBound) count, ok := c.cache[upperBound] if !ok { @@ -73,19 +73,29 @@ func TestRangeSplitting(t *testing.T) { }, } - requests, err := FindRangesForRepoSearch(cache) + requests, err := FindRangesForRepoSearch(cache, 0, 524288) if err != nil { t.Errorf("Error while finding ranges: %v", err) } expected := []string{ - "<107", // cache.RequestString(RangeLessThan{0x6b}), - "107..128", // cache.RequestString(RangeWithin{0x6b, 0x80}), - "129..256", // cache.RequestString(RangeWithin{0x81, 0x100}), - "257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}), - ">4095", // cache.RequestString(RangeGreaterThan{0xfff}), + "0..106", // cache.RequestString(RangeWithin{0x00, 0x6a}), + "107..128", // cache.RequestString(RangeWithin{0x6b, 0x80}), + "129..256", // cache.RequestString(RangeWithin{0x81, 0x100}), + "257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}), + "4096..524288", // cache.RequestString(RangeWithin{0x1000, 0x80000}), } if !reflect.DeepEqual(requests, expected) { t.Errorf("Expected requests (%v) to equal (%v)", requests, expected) } } + +func TestRangeSizes(t *testing.T) { + s := "https://api.github.com/search/code?q=filename:kustomization.yaml+filename:kustomization.yml" + + "+filename:kustomization+size:2365..10000&order=desc&per_page=100&sort=indexed" + returnedResult := RangeSizes(s) + expectedResult := RangeWithin{uint64(2365), uint64(10000)} + if !reflect.DeepEqual(returnedResult, expectedResult) { + t.Errorf("RangeSizes expected (%v), got (%v)",expectedResult, returnedResult) + } +}