diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index 95e1a15b7..7e14f7072 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -21,9 +21,14 @@ const ( redisCacheURL = "REDIS_CACHE_URL" redisKeyURL = "REDIS_KEY_URL" retryCount = 3 + githubUserEnv = "GITHUB_USER" + githubRepoEnv = "GITHUB_REPO" ) func main() { + githubUser := os.Getenv(githubUserEnv) + githubRepo := os.Getenv(githubRepoEnv) + githubToken := os.Getenv(githubAccessTokenVar) if githubToken == "" { fmt.Printf("Must set the variable '%s' to make github requests.\n", @@ -38,21 +43,9 @@ func main() { return } + seedDocs := make(crawler.CrawlSeed, 0) + cacheURL := os.Getenv(redisCacheURL) - - query := []byte(`{ "query":{ "match_all":{} } }`) - it := idx.IterateQuery(query, 10000, 60*time.Second) - docs := make(crawler.CrawlSeed, 0) - for it.Next() { - for _, hit := range it.Value().Hits.Hits { - docs = append(docs, hit.Document.Copy()) - } - } - - if err := it.Err(); err != nil { - fmt.Printf("Error iterating: %v\n", err) - } - cache, err := redis.DialURL(cacheURL) clientCache := &http.Client{} if err != nil { @@ -61,12 +54,6 @@ func main() { clientCache = httpclient.NewClient(cache) } - ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache, - github.QueryWith( - github.Filename("kustomization.yaml"), - github.Filename("kustomization.yml")), - ) - // docConverter takes in a plain document and processes it for the index. 
docConverter := func(d *doc.Document) (crawler.CrawledDocument, error) { kdoc := doc.KustomizationDocument{ @@ -81,7 +68,7 @@ func main() { index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error { switch d := cdoc.(type) { case *doc.KustomizationDocument: - fmt.Println("Inserting: ", d.ID(), d) + fmt.Println("Inserting: ", d) _, err := idx.Put(d.ID(), d) return err default: @@ -93,9 +80,43 @@ func main() { // This helps avoid indexing a given document multiple times. seen := make(map[string]struct{}) + var ghCrawler crawler.Crawler + + if githubRepo != "" { + ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml"), + github.Repo(githubRepo)), + ) + } else if githubUser != "" { + ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml"), + github.User(githubUser)), + ) + } else { + ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml")), + ) + + // get all the documents in the index + query := []byte(`{ "query":{ "match_all":{} } }`) + it := idx.IterateQuery(query, 10000, 60*time.Second) + for it.Next() { + for _, hit := range it.Value().Hits.Hits { + seedDocs = append(seedDocs, hit.Document.Copy()) + } + } + if err := it.Err(); err != nil { + fmt.Printf("Error iterating: %v\n", err) + } + } + crawlers := []crawler.Crawler{ghCrawler} - - crawler.CrawlFromSeed(ctx, docs, crawlers, docConverter, index, seen) - + crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, index, seen) crawler.CrawlGithub(ctx, crawlers, docConverter, index, seen) } diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 312b0168d..f57f247e5 100644 --- a/api/internal/crawl/crawler/crawler.go +++ 
b/api/internal/crawl/crawler/crawler.go @@ -68,11 +68,9 @@ func findMatch(d *doc.Document, crawlers []Crawler) Crawler { func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, seen map[string]struct{}, stack *CrawlSeed) { - if _, ok := seen[cdoc.ID()]; ok { - return - } seen[cdoc.ID()] = struct{}{} + // Insert into index err := indx(cdoc, match) logIfErr(err) @@ -98,12 +96,16 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C docCount := 0 // During the execution of the for loop, more Documents may be added into (*docsPtr). for len(*docsPtr) > 0 { - docCount++ // get the last Document in (*docPtr), which will be crawled in this iteration. tail := (*docsPtr)[len(*docsPtr)-1] // remove the last Document in (*docPtr) - *docsPtr = (*docsPtr)[:(len(*docsPtr)-1)] + *docsPtr = (*docsPtr)[:(len(*docsPtr) - 1)] + + if _, ok := seen[tail.ID()]; ok { + continue + } + docCount++ match := findMatch(tail, crawlers) if match == nil { @@ -138,7 +140,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C // CrawlFromSeed updates all the documents in seed, and crawls all the new // documents referred in the seed. func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler, - conv Converter, indx IndexFunc, seen map[string]struct{}) { + conv Converter, indx IndexFunc, seen map[string]struct{}) { // stack tracks the documents directly referred in other documents. stack := make(CrawlSeed, 0) @@ -218,7 +220,7 @@ func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument, // CrawlGithub crawls all the kustomization files on Github. func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, - indx IndexFunc, seen map[string]struct{}) { + indx IndexFunc, seen map[string]struct{}) { // stack tracks the documents directly referred in other documents. 
stack := make(CrawlSeed, 0) @@ -244,14 +246,15 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, } }() + logger.Println("processing the documents found from crawling github") if errs := CrawlGithubRunner(ctx, ch, crawlers); errs != nil { for _, err := range errs { logIfErr(err) } } close(ch) - logger.Println("Processing the documents found from crawling github") wg.Wait() + // Handle deps of newly discovered documents. logger.Printf("crawling the %d new documents referred by other documents", len(stack)) diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index a42c33074..66fa9b20c 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -230,7 +230,6 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec) ( RepositoryURL: k.Repository.URL, }, } - logger.Printf("Set the creationTime field") creationTime, err := gcl.GetFileCreationTime(k) if err != nil { logger.Printf("GetFileCreationTime failed: %v", err) @@ -533,7 +532,7 @@ func (gcl GhClient) parseGithubResponse(getRequest string) GhResponseInfo { } // SearchGithubAPI performs a search query and handles rate limitting for -// the 'code/search?' endpoint as well as timed retries in the case of abuse +// the 'search/code?' endpoint as well as timed retries in the case of abuse // prevention. func (gcl GhClient) SearchGithubAPI(query string) (*http.Response, error) { throttleSearchAPI() diff --git a/api/internal/crawl/crawler/github/queries.go b/api/internal/crawl/crawler/github/queries.go index 319526daf..98eb1b9f3 100644 --- a/api/internal/crawl/crawler/github/queries.go +++ b/api/internal/crawl/crawler/github/queries.go @@ -90,6 +90,17 @@ func Path(p string) queryField { return queryField{name: "path", value: p} } +// Repo takes a repository (e.g., kubernetes-sigs/kustomize) and formats +// it according to the Github API. 
+func Repo(r string) queryField { + return queryField{name: "repo", value: r} +} + +// User takes a github username and formats it according to the Github API. +func User(u string) queryField { + return queryField{name: "user", value: u} +} + // RequestConfig stores common variables that must be present for the queries. // - CodeSearchRequests: ask Github to check the code indices given a query. // - ContentsRequests: ask Github where to download a resource given a repo and a diff --git a/api/internal/crawl/crawler/github/queries_test.go b/api/internal/crawl/crawler/github/queries_test.go index a5b7e820c..07c9036b6 100644 --- a/api/internal/crawl/crawler/github/queries_test.go +++ b/api/internal/crawl/crawler/github/queries_test.go @@ -53,8 +53,11 @@ func TestQueryType(t *testing.T) { Filename("kustomization.yaml"), Keyword("keyword1"), Keyword("keyword2"), + Repo("user1/repo1"), + User("user1"), ), - expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2", + expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2+" + + "repo:user1/repo1+user:user1", }, } diff --git a/api/internal/crawl/crawler/github/split_search_ranges.go b/api/internal/crawl/crawler/github/split_search_ranges.go index 919d3135d..1322932f4 100644 --- a/api/internal/crawl/crawler/github/split_search_ranges.go +++ b/api/internal/crawl/crawler/github/split_search_ranges.go @@ -243,7 +243,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { if err != nil { return nil, err } - logger.Println("total files: ", totalFiles) + logger.Println("total kustomization files: ", totalFiles) if githubMaxResultsPerQuery >= totalFiles { return []string{