Add support for crawling a specific git user or repo

This commit is contained in:
Haiyan Meng
2019-12-12 15:20:13 -08:00
parent 50ce2a66a3
commit a9244f759e
6 changed files with 73 additions and 36 deletions

View File

@@ -21,9 +21,14 @@ const (
redisCacheURL = "REDIS_CACHE_URL" redisCacheURL = "REDIS_CACHE_URL"
redisKeyURL = "REDIS_KEY_URL" redisKeyURL = "REDIS_KEY_URL"
retryCount = 3 retryCount = 3
githubUserEnv = "GITHUB_USER"
githubRepoEnv = "GITHUB_REPO"
) )
func main() { func main() {
githubUser := os.Getenv(githubUserEnv)
githubRepo := os.Getenv(githubRepoEnv)
githubToken := os.Getenv(githubAccessTokenVar) githubToken := os.Getenv(githubAccessTokenVar)
if githubToken == "" { if githubToken == "" {
fmt.Printf("Must set the variable '%s' to make github requests.\n", fmt.Printf("Must set the variable '%s' to make github requests.\n",
@@ -38,21 +43,9 @@ func main() {
return return
} }
seedDocs := make(crawler.CrawlSeed, 0)
cacheURL := os.Getenv(redisCacheURL) cacheURL := os.Getenv(redisCacheURL)
query := []byte(`{ "query":{ "match_all":{} } }`)
it := idx.IterateQuery(query, 10000, 60*time.Second)
docs := make(crawler.CrawlSeed, 0)
for it.Next() {
for _, hit := range it.Value().Hits.Hits {
docs = append(docs, hit.Document.Copy())
}
}
if err := it.Err(); err != nil {
fmt.Printf("Error iterating: %v\n", err)
}
cache, err := redis.DialURL(cacheURL) cache, err := redis.DialURL(cacheURL)
clientCache := &http.Client{} clientCache := &http.Client{}
if err != nil { if err != nil {
@@ -61,12 +54,6 @@ func main() {
clientCache = httpclient.NewClient(cache) clientCache = httpclient.NewClient(cache)
} }
ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith(
github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml")),
)
// docConverter takes in a plain document and processes it for the index. // docConverter takes in a plain document and processes it for the index.
docConverter := func(d *doc.Document) (crawler.CrawledDocument, error) { docConverter := func(d *doc.Document) (crawler.CrawledDocument, error) {
kdoc := doc.KustomizationDocument{ kdoc := doc.KustomizationDocument{
@@ -81,7 +68,7 @@ func main() {
index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error { index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
switch d := cdoc.(type) { switch d := cdoc.(type) {
case *doc.KustomizationDocument: case *doc.KustomizationDocument:
fmt.Println("Inserting: ", d.ID(), d) fmt.Println("Inserting: ", d)
_, err := idx.Put(d.ID(), d) _, err := idx.Put(d.ID(), d)
return err return err
default: default:
@@ -93,9 +80,43 @@ func main() {
// This helps avoid indexing a given document multiple times. // This helps avoid indexing a given document multiple times.
seen := make(map[string]struct{}) seen := make(map[string]struct{})
var ghCrawler crawler.Crawler
if githubRepo != "" {
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith(
github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml"),
github.Repo(githubRepo)),
)
} else if githubUser != "" {
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith(
github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml"),
github.User(githubUser)),
)
} else {
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith(
github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml")),
)
// get all the documents in the index
query := []byte(`{ "query":{ "match_all":{} } }`)
it := idx.IterateQuery(query, 10000, 60*time.Second)
for it.Next() {
for _, hit := range it.Value().Hits.Hits {
seedDocs = append(seedDocs, hit.Document.Copy())
}
}
if err := it.Err(); err != nil {
fmt.Printf("Error iterating: %v\n", err)
}
}
crawlers := []crawler.Crawler{ghCrawler} crawlers := []crawler.Crawler{ghCrawler}
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, index, seen)
crawler.CrawlFromSeed(ctx, docs, crawlers, docConverter, index, seen)
crawler.CrawlGithub(ctx, crawlers, docConverter, index, seen) crawler.CrawlGithub(ctx, crawlers, docConverter, index, seen)
} }

View File

@@ -68,11 +68,9 @@ func findMatch(d *doc.Document, crawlers []Crawler) Crawler {
func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
seen map[string]struct{}, stack *CrawlSeed) { seen map[string]struct{}, stack *CrawlSeed) {
if _, ok := seen[cdoc.ID()]; ok {
return
}
seen[cdoc.ID()] = struct{}{} seen[cdoc.ID()] = struct{}{}
// Insert into index // Insert into index
err := indx(cdoc, match) err := indx(cdoc, match)
logIfErr(err) logIfErr(err)
@@ -98,12 +96,16 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
docCount := 0 docCount := 0
// During the execution of the for loop, more Documents may be added into (*docsPtr). // During the execution of the for loop, more Documents may be added into (*docsPtr).
for len(*docsPtr) > 0 { for len(*docsPtr) > 0 {
docCount++
// get the last Document in (*docPtr), which will be crawled in this iteration. // get the last Document in (*docPtr), which will be crawled in this iteration.
tail := (*docsPtr)[len(*docsPtr)-1] tail := (*docsPtr)[len(*docsPtr)-1]
// remove the last Document in (*docPtr) // remove the last Document in (*docPtr)
*docsPtr = (*docsPtr)[:(len(*docsPtr)-1)] *docsPtr = (*docsPtr)[:(len(*docsPtr) - 1)]
if _, ok := seen[tail.ID()]; ok {
continue
}
docCount++
match := findMatch(tail, crawlers) match := findMatch(tail, crawlers)
if match == nil { if match == nil {
@@ -138,7 +140,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
// CrawlFromSeed updates all the documents in seed, and crawls all the new // CrawlFromSeed updates all the documents in seed, and crawls all the new
// documents referred in the seed. // documents referred in the seed.
func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler, func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler,
conv Converter, indx IndexFunc, seen map[string]struct{}) { conv Converter, indx IndexFunc, seen map[string]struct{}) {
// stack tracks the documents directly referred in other documents. // stack tracks the documents directly referred in other documents.
stack := make(CrawlSeed, 0) stack := make(CrawlSeed, 0)
@@ -218,7 +220,7 @@ func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument,
// CrawlGithub crawls all the kustomization files on Github. // CrawlGithub crawls all the kustomization files on Github.
func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
indx IndexFunc, seen map[string]struct{}) { indx IndexFunc, seen map[string]struct{}) {
// stack tracks the documents directly referred in other documents. // stack tracks the documents directly referred in other documents.
stack := make(CrawlSeed, 0) stack := make(CrawlSeed, 0)
@@ -244,14 +246,15 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
} }
}() }()
logger.Println("processing the documents found from crawling github")
if errs := CrawlGithubRunner(ctx, ch, crawlers); errs != nil { if errs := CrawlGithubRunner(ctx, ch, crawlers); errs != nil {
for _, err := range errs { for _, err := range errs {
logIfErr(err) logIfErr(err)
} }
} }
close(ch) close(ch)
logger.Println("Processing the documents found from crawling github")
wg.Wait() wg.Wait()
// Handle deps of newly discovered documents. // Handle deps of newly discovered documents.
logger.Printf("crawling the %d new documents referred by other documents", logger.Printf("crawling the %d new documents referred by other documents",
len(stack)) len(stack))

View File

@@ -230,7 +230,6 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec) (
RepositoryURL: k.Repository.URL, RepositoryURL: k.Repository.URL,
}, },
} }
logger.Printf("Set the creationTime field")
creationTime, err := gcl.GetFileCreationTime(k) creationTime, err := gcl.GetFileCreationTime(k)
if err != nil { if err != nil {
logger.Printf("GetFileCreationTime failed: %v", err) logger.Printf("GetFileCreationTime failed: %v", err)
@@ -533,7 +532,7 @@ func (gcl GhClient) parseGithubResponse(getRequest string) GhResponseInfo {
} }
// SearchGithubAPI performs a search query and handles rate limitting for // SearchGithubAPI performs a search query and handles rate limitting for
// the 'code/search?' endpoint as well as timed retries in the case of abuse // the 'search/code?' endpoint as well as timed retries in the case of abuse
// prevention. // prevention.
func (gcl GhClient) SearchGithubAPI(query string) (*http.Response, error) { func (gcl GhClient) SearchGithubAPI(query string) (*http.Response, error) {
throttleSearchAPI() throttleSearchAPI()

View File

@@ -90,6 +90,17 @@ func Path(p string) queryField {
return queryField{name: "path", value: p} return queryField{name: "path", value: p}
} }
// Repo builds the "repo" query field for a repository given in owner/name
// form (e.g., kubernetes-sigs/kustomize), so that it appears in the search
// query as repo:<owner>/<name>.
func Repo(r string) queryField {
	field := queryField{value: r, name: "repo"}
	return field
}
// User builds the "user" query field for a GitHub username, so that it
// appears in the search query as user:<username>.
func User(u string) queryField {
	return queryField{name: "user", value: u}
}
// RequestConfig stores common variables that must be present for the queries. // RequestConfig stores common variables that must be present for the queries.
// - CodeSearchRequests: ask Github to check the code indices given a query. // - CodeSearchRequests: ask Github to check the code indices given a query.
// - ContentsRequests: ask Github where to download a resource given a repo and a // - ContentsRequests: ask Github where to download a resource given a repo and a

View File

@@ -53,8 +53,11 @@ func TestQueryType(t *testing.T) {
Filename("kustomization.yaml"), Filename("kustomization.yaml"),
Keyword("keyword1"), Keyword("keyword1"),
Keyword("keyword2"), Keyword("keyword2"),
Repo("user1/repo1"),
User("user1"),
), ),
expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2", expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2+" +
"repo:user1/repo1+user:user1",
}, },
} }

View File

@@ -243,7 +243,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
if err != nil { if err != nil {
return nil, err return nil, err
} }
logger.Println("total files: ", totalFiles) logger.Println("total kustomization files: ", totalFiles)
if githubMaxResultsPerQuery >= totalFiles { if githubMaxResultsPerQuery >= totalFiles {
return []string{ return []string{