mirror of
https://github.com/kubernetes-sigs/kustomize.git
synced 2026-06-11 17:12:51 +00:00
Add support for crawling a specific GitHub user or repo
This commit is contained in:
@@ -21,9 +21,14 @@ const (
|
|||||||
redisCacheURL = "REDIS_CACHE_URL"
|
redisCacheURL = "REDIS_CACHE_URL"
|
||||||
redisKeyURL = "REDIS_KEY_URL"
|
redisKeyURL = "REDIS_KEY_URL"
|
||||||
retryCount = 3
|
retryCount = 3
|
||||||
|
githubUserEnv = "GITHUB_USER"
|
||||||
|
githubRepoEnv = "GITHUB_REPO"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
githubUser := os.Getenv(githubUserEnv)
|
||||||
|
githubRepo := os.Getenv(githubRepoEnv)
|
||||||
|
|
||||||
githubToken := os.Getenv(githubAccessTokenVar)
|
githubToken := os.Getenv(githubAccessTokenVar)
|
||||||
if githubToken == "" {
|
if githubToken == "" {
|
||||||
fmt.Printf("Must set the variable '%s' to make github requests.\n",
|
fmt.Printf("Must set the variable '%s' to make github requests.\n",
|
||||||
@@ -38,21 +43,9 @@ func main() {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
seedDocs := make(crawler.CrawlSeed, 0)
|
||||||
|
|
||||||
cacheURL := os.Getenv(redisCacheURL)
|
cacheURL := os.Getenv(redisCacheURL)
|
||||||
|
|
||||||
query := []byte(`{ "query":{ "match_all":{} } }`)
|
|
||||||
it := idx.IterateQuery(query, 10000, 60*time.Second)
|
|
||||||
docs := make(crawler.CrawlSeed, 0)
|
|
||||||
for it.Next() {
|
|
||||||
for _, hit := range it.Value().Hits.Hits {
|
|
||||||
docs = append(docs, hit.Document.Copy())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := it.Err(); err != nil {
|
|
||||||
fmt.Printf("Error iterating: %v\n", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
cache, err := redis.DialURL(cacheURL)
|
cache, err := redis.DialURL(cacheURL)
|
||||||
clientCache := &http.Client{}
|
clientCache := &http.Client{}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -61,12 +54,6 @@ func main() {
|
|||||||
clientCache = httpclient.NewClient(cache)
|
clientCache = httpclient.NewClient(cache)
|
||||||
}
|
}
|
||||||
|
|
||||||
ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache,
|
|
||||||
github.QueryWith(
|
|
||||||
github.Filename("kustomization.yaml"),
|
|
||||||
github.Filename("kustomization.yml")),
|
|
||||||
)
|
|
||||||
|
|
||||||
// docConverter takes in a plain document and processes it for the index.
|
// docConverter takes in a plain document and processes it for the index.
|
||||||
docConverter := func(d *doc.Document) (crawler.CrawledDocument, error) {
|
docConverter := func(d *doc.Document) (crawler.CrawledDocument, error) {
|
||||||
kdoc := doc.KustomizationDocument{
|
kdoc := doc.KustomizationDocument{
|
||||||
@@ -81,7 +68,7 @@ func main() {
|
|||||||
index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
|
index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
|
||||||
switch d := cdoc.(type) {
|
switch d := cdoc.(type) {
|
||||||
case *doc.KustomizationDocument:
|
case *doc.KustomizationDocument:
|
||||||
fmt.Println("Inserting: ", d.ID(), d)
|
fmt.Println("Inserting: ", d)
|
||||||
_, err := idx.Put(d.ID(), d)
|
_, err := idx.Put(d.ID(), d)
|
||||||
return err
|
return err
|
||||||
default:
|
default:
|
||||||
@@ -93,9 +80,43 @@ func main() {
|
|||||||
// This helps avoid indexing a given document multiple times.
|
// This helps avoid indexing a given document multiple times.
|
||||||
seen := make(map[string]struct{})
|
seen := make(map[string]struct{})
|
||||||
|
|
||||||
|
var ghCrawler crawler.Crawler
|
||||||
|
|
||||||
|
if githubRepo != "" {
|
||||||
|
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache,
|
||||||
|
github.QueryWith(
|
||||||
|
github.Filename("kustomization.yaml"),
|
||||||
|
github.Filename("kustomization.yml"),
|
||||||
|
github.Repo(githubRepo)),
|
||||||
|
)
|
||||||
|
} else if githubUser != "" {
|
||||||
|
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache,
|
||||||
|
github.QueryWith(
|
||||||
|
github.Filename("kustomization.yaml"),
|
||||||
|
github.Filename("kustomization.yml"),
|
||||||
|
github.User(githubUser)),
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache,
|
||||||
|
github.QueryWith(
|
||||||
|
github.Filename("kustomization.yaml"),
|
||||||
|
github.Filename("kustomization.yml")),
|
||||||
|
)
|
||||||
|
|
||||||
|
// get all the documents in the index
|
||||||
|
query := []byte(`{ "query":{ "match_all":{} } }`)
|
||||||
|
it := idx.IterateQuery(query, 10000, 60*time.Second)
|
||||||
|
for it.Next() {
|
||||||
|
for _, hit := range it.Value().Hits.Hits {
|
||||||
|
seedDocs = append(seedDocs, hit.Document.Copy())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := it.Err(); err != nil {
|
||||||
|
fmt.Printf("Error iterating: %v\n", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
crawlers := []crawler.Crawler{ghCrawler}
|
crawlers := []crawler.Crawler{ghCrawler}
|
||||||
|
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, index, seen)
|
||||||
crawler.CrawlFromSeed(ctx, docs, crawlers, docConverter, index, seen)
|
|
||||||
|
|
||||||
crawler.CrawlGithub(ctx, crawlers, docConverter, index, seen)
|
crawler.CrawlGithub(ctx, crawlers, docConverter, index, seen)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -68,11 +68,9 @@ func findMatch(d *doc.Document, crawlers []Crawler) Crawler {
|
|||||||
|
|
||||||
func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
|
func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
|
||||||
seen map[string]struct{}, stack *CrawlSeed) {
|
seen map[string]struct{}, stack *CrawlSeed) {
|
||||||
if _, ok := seen[cdoc.ID()]; ok {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
seen[cdoc.ID()] = struct{}{}
|
seen[cdoc.ID()] = struct{}{}
|
||||||
|
|
||||||
// Insert into index
|
// Insert into index
|
||||||
err := indx(cdoc, match)
|
err := indx(cdoc, match)
|
||||||
logIfErr(err)
|
logIfErr(err)
|
||||||
@@ -98,12 +96,16 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
|
|||||||
docCount := 0
|
docCount := 0
|
||||||
// During the execution of the for loop, more Documents may be added into (*docsPtr).
|
// During the execution of the for loop, more Documents may be added into (*docsPtr).
|
||||||
for len(*docsPtr) > 0 {
|
for len(*docsPtr) > 0 {
|
||||||
docCount++
|
|
||||||
// get the last Document in (*docsPtr), which will be crawled in this iteration.
|
// get the last Document in (*docsPtr), which will be crawled in this iteration.
|
||||||
tail := (*docsPtr)[len(*docsPtr)-1]
|
tail := (*docsPtr)[len(*docsPtr)-1]
|
||||||
|
|
||||||
// remove the last Document in (*docsPtr)
|
// remove the last Document in (*docsPtr)
|
||||||
*docsPtr = (*docsPtr)[:(len(*docsPtr)-1)]
|
*docsPtr = (*docsPtr)[:(len(*docsPtr) - 1)]
|
||||||
|
|
||||||
|
if _, ok := seen[tail.ID()]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
docCount++
|
||||||
|
|
||||||
match := findMatch(tail, crawlers)
|
match := findMatch(tail, crawlers)
|
||||||
if match == nil {
|
if match == nil {
|
||||||
@@ -138,7 +140,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
|
|||||||
// CrawlFromSeed updates all the documents in seed, and crawls all the new
|
// CrawlFromSeed updates all the documents in seed, and crawls all the new
|
||||||
// documents referred in the seed.
|
// documents referred in the seed.
|
||||||
func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler,
|
func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler,
|
||||||
conv Converter, indx IndexFunc, seen map[string]struct{}) {
|
conv Converter, indx IndexFunc, seen map[string]struct{}) {
|
||||||
|
|
||||||
// stack tracks the documents directly referred in other documents.
|
// stack tracks the documents directly referred in other documents.
|
||||||
stack := make(CrawlSeed, 0)
|
stack := make(CrawlSeed, 0)
|
||||||
@@ -218,7 +220,7 @@ func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument,
|
|||||||
|
|
||||||
// CrawlGithub crawls all the kustomization files on Github.
|
// CrawlGithub crawls all the kustomization files on Github.
|
||||||
func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
||||||
indx IndexFunc, seen map[string]struct{}) {
|
indx IndexFunc, seen map[string]struct{}) {
|
||||||
// stack tracks the documents directly referred in other documents.
|
// stack tracks the documents directly referred in other documents.
|
||||||
stack := make(CrawlSeed, 0)
|
stack := make(CrawlSeed, 0)
|
||||||
|
|
||||||
@@ -244,14 +246,15 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
logger.Println("processing the documents found from crawling github")
|
||||||
if errs := CrawlGithubRunner(ctx, ch, crawlers); errs != nil {
|
if errs := CrawlGithubRunner(ctx, ch, crawlers); errs != nil {
|
||||||
for _, err := range errs {
|
for _, err := range errs {
|
||||||
logIfErr(err)
|
logIfErr(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
close(ch)
|
close(ch)
|
||||||
logger.Println("Processing the documents found from crawling github")
|
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
// Handle deps of newly discovered documents.
|
// Handle deps of newly discovered documents.
|
||||||
logger.Printf("crawling the %d new documents referred by other documents",
|
logger.Printf("crawling the %d new documents referred by other documents",
|
||||||
len(stack))
|
len(stack))
|
||||||
|
|||||||
@@ -230,7 +230,6 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec) (
|
|||||||
RepositoryURL: k.Repository.URL,
|
RepositoryURL: k.Repository.URL,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
logger.Printf("Set the creationTime field")
|
|
||||||
creationTime, err := gcl.GetFileCreationTime(k)
|
creationTime, err := gcl.GetFileCreationTime(k)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Printf("GetFileCreationTime failed: %v", err)
|
logger.Printf("GetFileCreationTime failed: %v", err)
|
||||||
@@ -533,7 +532,7 @@ func (gcl GhClient) parseGithubResponse(getRequest string) GhResponseInfo {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// SearchGithubAPI performs a search query and handles rate limiting for
|
// SearchGithubAPI performs a search query and handles rate limiting for
|
||||||
// the 'code/search?' endpoint as well as timed retries in the case of abuse
|
// the 'search/code?' endpoint as well as timed retries in the case of abuse
|
||||||
// prevention.
|
// prevention.
|
||||||
func (gcl GhClient) SearchGithubAPI(query string) (*http.Response, error) {
|
func (gcl GhClient) SearchGithubAPI(query string) (*http.Response, error) {
|
||||||
throttleSearchAPI()
|
throttleSearchAPI()
|
||||||
|
|||||||
@@ -90,6 +90,17 @@ func Path(p string) queryField {
|
|||||||
return queryField{name: "path", value: p}
|
return queryField{name: "path", value: p}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Repo takes a repository (e.g., kubernetes-sigs/kustomize) and formats
|
||||||
|
// it according to the Github API.
|
||||||
|
func Repo(r string) queryField {
|
||||||
|
return queryField{name: "repo", value: r}
|
||||||
|
}
|
||||||
|
|
||||||
|
// User takes a github username and formats it according to the Github API.
|
||||||
|
func User(u string) queryField {
|
||||||
|
return queryField{name: "user", value: u}
|
||||||
|
}
|
||||||
|
|
||||||
// RequestConfig stores common variables that must be present for the queries.
|
// RequestConfig stores common variables that must be present for the queries.
|
||||||
// - CodeSearchRequests: ask Github to check the code indices given a query.
|
// - CodeSearchRequests: ask Github to check the code indices given a query.
|
||||||
// - ContentsRequests: ask Github where to download a resource given a repo and a
|
// - ContentsRequests: ask Github where to download a resource given a repo and a
|
||||||
|
|||||||
@@ -53,8 +53,11 @@ func TestQueryType(t *testing.T) {
|
|||||||
Filename("kustomization.yaml"),
|
Filename("kustomization.yaml"),
|
||||||
Keyword("keyword1"),
|
Keyword("keyword1"),
|
||||||
Keyword("keyword2"),
|
Keyword("keyword2"),
|
||||||
|
Repo("user1/repo1"),
|
||||||
|
User("user1"),
|
||||||
),
|
),
|
||||||
expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2",
|
expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2+" +
|
||||||
|
"repo:user1/repo1+user:user1",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -243,7 +243,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
logger.Println("total files: ", totalFiles)
|
logger.Println("total kustomization files: ", totalFiles)
|
||||||
|
|
||||||
if githubMaxResultsPerQuery >= totalFiles {
|
if githubMaxResultsPerQuery >= totalFiles {
|
||||||
return []string{
|
return []string{
|
||||||
|
|||||||
Reference in New Issue
Block a user