From be2e03681d5151c39cb1764290cfd7a98ae69f9d Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 17 Dec 2019 14:56:52 -0800 Subject: [PATCH] Remove unused param from IndexFunc --- api/internal/crawl/cmd/crawler/crawler.go | 157 ++++++++++++------ .../crawl/config/crawler/job/README.md | 39 +++-- .../crawl/config/crawler/job/job.yaml | 2 + api/internal/crawl/crawler/crawler.go | 6 +- api/internal/crawl/crawler/crawler_test.go | 2 +- 5 files changed, 135 insertions(+), 71 deletions(-) diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index 7c62522af..fdbafeafd 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -22,34 +22,53 @@ const ( redisCacheURL = "REDIS_CACHE_URL" redisKeyURL = "REDIS_KEY_URL" retryCount = 3 - githubUserEnv = "GITHUB_USER" - githubRepoEnv = "GITHUB_REPO" - crawlIndexOnlyEnv = "CRAWL_INDEX_ONLY" - crawlGithubOnlyEnv = "CRAWL_GITHUB_ONLY" ) -// countEnvs count the environment variables whose values are not empty. 
-func countEnvs(envs ...string) int { - count := 0 - for _, env := range envs { - if env != "" { - count++ - } +type CrawlMode int +const ( + CrawlUnknown CrawlMode = iota + // Crawl all the kustomization files in all the repositories of a Github user + CrawlUser + // Crawl all the kustomization files in a Github repo + CrawlRepo + // Crawl all the documents in the index + CrawlIndex + // Crawl all the kustomization files on Github + CrawlGithub + // Crawl all the documents in the index and crawl all the kustomization files on Github + CrawlIndexAndGithub +) + +func NewCrawlMode(s string) CrawlMode { + switch s { + case "github-user": + return CrawlUser + case "github-repo": + return CrawlRepo + case "": + return CrawlIndexAndGithub + case "index": + return CrawlIndex + case "github": + return CrawlGithub + default: + return CrawlUnknown } - return count +} + +func Usage() { + fmt.Printf("Usage: %s [mode] [githubUser|githubRepo]\n", os.Args[0]) + fmt.Printf("\tmode can be one of [github-user, github-repo, index, github]\n") + fmt.Printf("%s: crawl all the documents in the index and crawling all the kustomization files on Github\n", os.Args[0]) + fmt.Printf("%s index: crawl all the documents in the index\n", os.Args[0]) + fmt.Printf("%s github: crawl all the kustomization files on Github\n", os.Args[0]) + fmt.Printf("%s github-user : Crawl all the kustomization files in all the repositories of a Github user\n", os.Args[0]) + fmt.Printf("\tFor example, %s github-user kubernetes-sigs\n", os.Args[0]) + fmt.Printf("%s github-repo : Crawl all the kustomization files in a Github repo\n", os.Args[0]) + fmt.Printf("\tFor example, %s github-repo kubernetes-sigs/kustomize\n", os.Args[0]) } func main() { - githubUser := os.Getenv(githubUserEnv) - githubRepo := os.Getenv(githubRepoEnv) - crawlIndexOnly := os.Getenv(crawlIndexOnlyEnv) - crawlGithubOnly := os.Getenv(crawlGithubOnlyEnv) - - if countEnvs(githubUser, githubRepo, crawlIndexOnly, crawlGithubOnly) > 1 { -
log.Fatalf("only one of [%s, %s, %s, %s] should be set", - githubUserEnv, githubRepoEnv, crawlIndexOnlyEnv, crawlGithubOnlyEnv) - } - githubToken := os.Getenv(githubAccessTokenVar) if githubToken == "" { fmt.Printf("Must set the variable '%s' to make github requests.\n", @@ -64,8 +83,6 @@ func main() { return } - seedDocs := make(crawler.CrawlSeed, 0) - cacheURL := os.Getenv(redisCacheURL) cache, err := redis.DialURL(cacheURL) clientCache := &http.Client{} @@ -86,7 +103,7 @@ func main() { } // Index updates the value in the index. - indexFunc := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler, mode index.Mode) error { + indexFunc := func(cdoc crawler.CrawledDocument, mode index.Mode) error { switch d := cdoc.(type) { case *doc.KustomizationDocument: switch mode { @@ -106,30 +123,41 @@ func main() { // This helps avoid indexing a given document multiple times. seen := make(map[string]struct{}) - var ghCrawler crawler.Crawler - - if githubRepo != "" { - ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, - github.QueryWith( - github.Filename("kustomization.yaml"), - github.Filename("kustomization.yml"), - github.Repo(githubRepo)), - ) - } else if githubUser != "" { - ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, - github.QueryWith( - github.Filename("kustomization.yaml"), - github.Filename("kustomization.yml"), - github.User(githubUser)), - ) + var mode CrawlMode + if len(os.Args) == 1 { + mode = CrawlIndexAndGithub } else { - ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, - github.QueryWith( - github.Filename("kustomization.yaml"), - github.Filename("kustomization.yml")), - ) + mode = NewCrawlMode(os.Args[1]) + } - // get all the documents in the index + ghCrawlerConstructor := func(user, repo string) crawler.Crawler { + if user != "" { + return github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml"), 
+ github.User(user)), + ) + } else if repo != "" { + return github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml"), + github.Repo(repo)), + ) + } else { + return github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml")), + ) + } + } + + seedDocs := make(crawler.CrawlSeed, 0) + + // get all the documents in the index + getSeedDocsFunc := func() { query := []byte(`{ "query":{ "match_all":{} } }`) it := idx.IterateQuery(query, 10000, 60*time.Second) for it.Next() { @@ -142,14 +170,35 @@ func main() { } } - crawlers := []crawler.Crawler{ghCrawler} - - if crawlGithubOnly == "true" || githubRepo != "" || githubUser != "" { - crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) - } else if crawlIndexOnly == "true" { - crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) - } else { + switch mode { + case CrawlIndexAndGithub: + getSeedDocsFunc() + crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")} crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + case CrawlIndex: + getSeedDocsFunc() + crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")} + crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) + case CrawlGithub: + crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")} + crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + case CrawlUser: + if len(os.Args) < 3 { + Usage() + log.Fatalf("Please specify a github user!") + } + crawlers := []crawler.Crawler{ghCrawlerConstructor(os.Args[2], "")} + crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + case CrawlRepo: + if len(os.Args) < 3 { + Usage() + log.Fatalf("Please specify a github repo!") + } + crawlers := 
[]crawler.Crawler{ghCrawlerConstructor("", os.Args[2])} + crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + case CrawlUnknown: + Usage() + log.Fatalf("The crawler mode must be one of [github-user, github-repo, index, github]") } } diff --git a/api/internal/crawl/config/crawler/job/README.md b/api/internal/crawl/config/crawler/job/README.md index 3fc3e665b..3570f27ee 100644 --- a/api/internal/crawl/config/crawler/job/README.md +++ b/api/internal/crawl/config/crawler/job/README.md @@ -2,40 +2,53 @@ There are three ways of running the crawler job. # Crawling all the documents in the index and crawling all the kustomization files on Github -This is the default setting of the crawler job. +This is the default setting of the crawler job. The `command` and `args` fields +of the container should be: + +``` + command: ["/crawler"] + args: [] +``` + +Or + +``` + command: ["/crawler"] + args: [""] +``` # Crawling all the documents in the index -Set the environment variable `CRAWL_INDEX_ONLY` to `true` like this: +The `command` and `args` fields of the container should be: ``` - - name: CRAWL_INDEX_ONLY - value: true + command: ["/crawler"] + args: ["index"] ``` # Crawling all the kustomization files on Github -Set the environment variable `CRAWL_GITHUB_ONLY` to `true` like this: +The `command` and `args` fields of the container should be: ``` - - name: CRAWL_GITHUB_ONLY - value: true + command: ["/crawler"] + args: ["github"] ``` # Crawling all the kustomization files in a Github repo -Add the environment variable `GITHUB_REPO` into the crawler container. For example: +The `command` and `args` fields of the container should be like: ``` - - name: GITHUB_REPO - value: kubernetes-sigs/kustomize + command: ["/crawler"] + args: ["github-repo", "kubernetes-sigs/kustomize"] ``` # Crawling all the kustomization files in all the repositories of a Github user -Add the environment variable `GITHUB_USER` into the crawler container. 
For example: +The `command` and `args` fields of the container should be like: ``` - - name: GITHUB_USER - value: kubernetes-sigs + command: ["/crawler"] + args: ["github-user", "kubernetes-sigs"] ``` diff --git a/api/internal/crawl/config/crawler/job/job.yaml b/api/internal/crawl/config/crawler/job/job.yaml index 6dd8d4c97..28e36bcb8 100644 --- a/api/internal/crawl/config/crawler/job/job.yaml +++ b/api/internal/crawl/config/crawler/job/job.yaml @@ -10,6 +10,8 @@ spec: - name: crawler image: gcr.io/haiyanmeng-gke-dev/crawler:v1 imagePullPolicy: Always + command: ["/crawler"] + args: ["github-repo", "kubernetes-sigs/kustomize"] env: - name: GITHUB_ACCESS_TOKEN valueFrom: diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index d46cf161d..31cabc2b7 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -49,7 +49,7 @@ type CrawledDocument interface { type CrawlSeed []*doc.Document -type IndexFunc func(CrawledDocument, Crawler, index.Mode) error +type IndexFunc func(CrawledDocument, index.Mode) error type Converter func(*doc.Document) (CrawledDocument, error) func logIfErr(err error) { @@ -74,7 +74,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, seen[cdoc.ID()] = struct{}{} // Insert into index - if err := indx(cdoc, match, index.InsertOrUpdate); err != nil { + if err := indx(cdoc, index.InsertOrUpdate); err != nil { logger.Printf("Failed to insert or update %s %s: %v", cdoc.GetDocument().RepositoryURL, cdoc.GetDocument().FilePath, err) return @@ -142,7 +142,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C Document: *tail, } seen[cdoc.ID()] = struct{}{} - if err := indx(cdoc, match, index.Delete); err != nil { + if err := indx(cdoc, index.Delete); err != nil { logger.Printf("Failed to delete %s %s: %v", cdoc.RepositoryURL, cdoc.FilePath, err) } diff --git a/api/internal/crawl/crawler/crawler_test.go 
b/api/internal/crawl/crawler/crawler_test.go index 00a619c46..41a848612 100644 --- a/api/internal/crawl/crawler/crawler_test.go +++ b/api/internal/crawl/crawler/crawler_test.go @@ -318,7 +318,7 @@ resources: Document: *d, }, nil }, - func(d CrawledDocument, cr Crawler, mode index.Mode) error { + func(d CrawledDocument, mode index.Mode) error { visited[d.ID()]++ return nil },