Remove unused param from IndexFunc

This commit is contained in:
Haiyan Meng
2019-12-17 14:56:52 -08:00
parent 127541f610
commit be2e03681d
5 changed files with 135 additions and 71 deletions

View File

@@ -22,34 +22,53 @@ const (
redisCacheURL = "REDIS_CACHE_URL" redisCacheURL = "REDIS_CACHE_URL"
redisKeyURL = "REDIS_KEY_URL" redisKeyURL = "REDIS_KEY_URL"
retryCount = 3 retryCount = 3
githubUserEnv = "GITHUB_USER"
githubRepoEnv = "GITHUB_REPO"
crawlIndexOnlyEnv = "CRAWL_INDEX_ONLY"
crawlGithubOnlyEnv = "CRAWL_GITHUB_ONLY"
) )
// countEnvs count the environment variables whose values are not empty. type CrawlMode int
func countEnvs(envs ...string) int { const (
count := 0 CrawlUnknown CrawlMode = iota
for _, env := range envs { // Crawl all the kustomization files in all the repositories of a Github user
if env != "" { CrawlUser
count++ // Crawl all the kustomization files in a Github repo
CrawlRepo
// Crawl all the documents in the index
CrawlIndex
// Crawl all the kustomization files on Github
CrawlGithub
// Crawl all the documents in the index and crawling all the kustomization files on Github
CrawlIndexAndGithub
)
func NewCrawlMode(s string) CrawlMode {
switch s {
case "github-user":
return CrawlUser
case "github-repo":
return CrawlRepo
case "":
return CrawlIndexAndGithub
case "index":
return CrawlIndex
case "github":
return CrawlGithub
default:
return CrawlUnknown
} }
} }
return count
func Usage() {
fmt.Printf("Usage: %s [mode] [githubUser|githubRepo]\n", os.Args[0])
fmt.Printf("\tmode can be one of [github-user, github-repo, index, github]\n")
fmt.Printf("%s: crawl all the documents in the index and crawling all the kustomization files on Github\n", os.Args[0])
fmt.Printf("%s index: crawl all the documents in the index\n", os.Args[0])
fmt.Printf("%s gihub: crawl all the kustomization files on Github\n", os.Args[0])
fmt.Printf("%s github-user <github-user>: Crawl all the kustomization files in all the repositories of a Github user\n", os.Args[0])
fmt.Printf("\tFor example, %s github-user kubernetes-sigs\n", os.Args[0])
fmt.Printf("%s github-repo <github-repo>: Crawl all the kustomization files in a Github repo\n", os.Args[0])
fmt.Printf("\tFor example, %s github-repo kubernetes-sigs/kustomize\n", os.Args[0])
} }
func main() { func main() {
githubUser := os.Getenv(githubUserEnv)
githubRepo := os.Getenv(githubRepoEnv)
crawlIndexOnly := os.Getenv(crawlIndexOnlyEnv)
crawlGithubOnly := os.Getenv(crawlGithubOnlyEnv)
if countEnvs(githubUser, githubRepo, crawlIndexOnly, crawlGithubOnly) > 1 {
log.Fatalf("only one of [%s, %s, %s, %s] should be set",
githubUserEnv, githubRepoEnv, crawlIndexOnlyEnv, crawlGithubOnlyEnv)
}
githubToken := os.Getenv(githubAccessTokenVar) githubToken := os.Getenv(githubAccessTokenVar)
if githubToken == "" { if githubToken == "" {
fmt.Printf("Must set the variable '%s' to make github requests.\n", fmt.Printf("Must set the variable '%s' to make github requests.\n",
@@ -64,8 +83,6 @@ func main() {
return return
} }
seedDocs := make(crawler.CrawlSeed, 0)
cacheURL := os.Getenv(redisCacheURL) cacheURL := os.Getenv(redisCacheURL)
cache, err := redis.DialURL(cacheURL) cache, err := redis.DialURL(cacheURL)
clientCache := &http.Client{} clientCache := &http.Client{}
@@ -86,7 +103,7 @@ func main() {
} }
// Index updates the value in the index. // Index updates the value in the index.
indexFunc := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler, mode index.Mode) error { indexFunc := func(cdoc crawler.CrawledDocument, mode index.Mode) error {
switch d := cdoc.(type) { switch d := cdoc.(type) {
case *doc.KustomizationDocument: case *doc.KustomizationDocument:
switch mode { switch mode {
@@ -106,30 +123,41 @@ func main() {
// This helps avoid indexing a given document multiple times. // This helps avoid indexing a given document multiple times.
seen := make(map[string]struct{}) seen := make(map[string]struct{})
var ghCrawler crawler.Crawler var mode CrawlMode
if len(os.Args) == 1 {
mode = CrawlIndexAndGithub
} else {
mode = NewCrawlMode(os.Args[1])
}
if githubRepo != "" { ghCrawlerConstructor := func(user, repo string) crawler.Crawler {
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, if user != "" {
return github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith( github.QueryWith(
github.Filename("kustomization.yaml"), github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml"), github.Filename("kustomization.yml"),
github.Repo(githubRepo)), github.User(user)),
) )
} else if githubUser != "" { } else if repo != "" {
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, return github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith( github.QueryWith(
github.Filename("kustomization.yaml"), github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml"), github.Filename("kustomization.yml"),
github.User(githubUser)), github.Repo(repo)),
) )
} else { } else {
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, return github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith( github.QueryWith(
github.Filename("kustomization.yaml"), github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml")), github.Filename("kustomization.yml")),
) )
}
}
seedDocs := make(crawler.CrawlSeed, 0)
// get all the documents in the index // get all the documents in the index
getSeedDocsFunc := func() {
query := []byte(`{ "query":{ "match_all":{} } }`) query := []byte(`{ "query":{ "match_all":{} } }`)
it := idx.IterateQuery(query, 10000, 60*time.Second) it := idx.IterateQuery(query, 10000, 60*time.Second)
for it.Next() { for it.Next() {
@@ -142,14 +170,35 @@ func main() {
} }
} }
crawlers := []crawler.Crawler{ghCrawler} switch mode {
case CrawlIndexAndGithub:
if crawlGithubOnly == "true" || githubRepo != "" || githubUser != "" { getSeedDocsFunc()
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
} else if crawlIndexOnly == "true" {
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
} else {
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlIndex:
getSeedDocsFunc()
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
case CrawlGithub:
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlUser:
if len(os.Args) < 3 {
Usage()
log.Fatalf("Please specify a github user!")
}
crawlers := []crawler.Crawler{ghCrawlerConstructor(os.Args[2], "")}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlRepo:
if len(os.Args) < 3 {
Usage()
log.Fatalf("Please specify a github repo!")
}
crawlers := []crawler.Crawler{ghCrawlerConstructor("", os.Args[2])}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlUnknown:
Usage()
log.Fatalf("The crawler mode must be one of [github-user, github-repo, index, github]")
} }
} }

View File

@@ -2,40 +2,53 @@ There are three ways of running the crawler job.
# Crawling all the documents in the index and crawling all the kustomization files on Github # Crawling all the documents in the index and crawling all the kustomization files on Github
This is the default setting of the crawler job. This is the default setting of the crawler job. The `command` and `args` fields
of the container should be:
```
command: ["/crawler"]
args: []
```
Or
```
command: ["/crawler"]
args: [""]
```
# Crawling all the documents in the index # Crawling all the documents in the index
Set the environment variable `CRAWL_INDEX_ONLY` to `true` like this: The `command` and `args` fields of the container should be:
``` ```
- name: CRAWL_INDEX_ONLY command: ["/crawler"]
value: true args: ["index"]
``` ```
# Crawling all the kustomization files on Github # Crawling all the kustomization files on Github
Set the environment variable `CRAWL_GITHUB_ONLY` to `true` like this: The `command` and `args` fields of the container should be:
``` ```
- name: CRAWL_GITHUB_ONLY command: ["/crawler"]
value: true args: ["github"]
``` ```
# Crawling all the kustomization files in a Github repo # Crawling all the kustomization files in a Github repo
Add the environment variable `GITHUB_REPO` into the crawler container. For example: The `command` and `args` fields of the container should look like:
``` ```
- name: GITHUB_REPO command: ["/crawler"]
value: kubernetes-sigs/kustomize args: ["github-repo", "kubernetes-sigs/kustomize"]
``` ```
# Crawling all the kustomization files in all the repositories of a Github user # Crawling all the kustomization files in all the repositories of a Github user
Add the environment variable `GITHUB_USER` into the crawler container. For example: The `command` and `args` fields of the container should look like:
``` ```
- name: GITHUB_USER command: ["/crawler"]
value: kubernetes-sigs args: ["github-user", "kubernetes-sigs"]
``` ```

View File

@@ -10,6 +10,8 @@ spec:
- name: crawler - name: crawler
image: gcr.io/haiyanmeng-gke-dev/crawler:v1 image: gcr.io/haiyanmeng-gke-dev/crawler:v1
imagePullPolicy: Always imagePullPolicy: Always
command: ["/crawler"]
args: ["github-repo", "kubernetes-sigs/kustomize"]
env: env:
- name: GITHUB_ACCESS_TOKEN - name: GITHUB_ACCESS_TOKEN
valueFrom: valueFrom:

View File

@@ -49,7 +49,7 @@ type CrawledDocument interface {
type CrawlSeed []*doc.Document type CrawlSeed []*doc.Document
type IndexFunc func(CrawledDocument, Crawler, index.Mode) error type IndexFunc func(CrawledDocument, index.Mode) error
type Converter func(*doc.Document) (CrawledDocument, error) type Converter func(*doc.Document) (CrawledDocument, error)
func logIfErr(err error) { func logIfErr(err error) {
@@ -74,7 +74,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
seen[cdoc.ID()] = struct{}{} seen[cdoc.ID()] = struct{}{}
// Insert into index // Insert into index
if err := indx(cdoc, match, index.InsertOrUpdate); err != nil { if err := indx(cdoc, index.InsertOrUpdate); err != nil {
logger.Printf("Failed to insert or update %s %s: %v", logger.Printf("Failed to insert or update %s %s: %v",
cdoc.GetDocument().RepositoryURL, cdoc.GetDocument().FilePath, err) cdoc.GetDocument().RepositoryURL, cdoc.GetDocument().FilePath, err)
return return
@@ -142,7 +142,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
Document: *tail, Document: *tail,
} }
seen[cdoc.ID()] = struct{}{} seen[cdoc.ID()] = struct{}{}
if err := indx(cdoc, match, index.Delete); err != nil { if err := indx(cdoc, index.Delete); err != nil {
logger.Printf("Failed to delete %s %s: %v", logger.Printf("Failed to delete %s %s: %v",
cdoc.RepositoryURL, cdoc.FilePath, err) cdoc.RepositoryURL, cdoc.FilePath, err)
} }

View File

@@ -318,7 +318,7 @@ resources:
Document: *d, Document: *d,
}, nil }, nil
}, },
func(d CrawledDocument, cr Crawler, mode index.Mode) error { func(d CrawledDocument, mode index.Mode) error {
visited[d.ID()]++ visited[d.ID()]++
return nil return nil
}, },