From af131c74711420af5c7ca2af8f4c99a4786fdf36 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 14 Jan 2020 15:36:12 -0800 Subject: [PATCH] Use flags to specify crawling mode and github user/repo info --- api/internal/crawl/cmd/crawler/crawler.go | 52 ++++++++----------- .../crawl/config/crawler/job/README.md | 13 +++-- .../crawl/config/crawler/job/job.yaml | 2 +- 3 files changed, 30 insertions(+), 37 deletions(-) diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index e94afb306..a5306dbfc 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -46,7 +46,7 @@ func NewCrawlMode(s string) CrawlMode { return CrawlUser case "github-repo": return CrawlRepo - case "": + case "index+github": return CrawlIndexAndGithub case "index": return CrawlIndex @@ -57,21 +57,20 @@ func NewCrawlMode(s string) CrawlMode { } } -func Usage() { - fmt.Printf("Usage: %s [mode] [githubUser|githubRepo]\n", os.Args[0]) - fmt.Printf("\tmode can be one of [github-user, github-repo, index, github]\n") - fmt.Printf("%s: crawl all the documents in the index and crawling all the kustomization files on Github\n", os.Args[0]) - fmt.Printf("%s index: crawl all the documents in the index\n", os.Args[0]) - fmt.Printf("%s gihub: crawl all the kustomization files on Github\n", os.Args[0]) - fmt.Printf("%s github-user : Crawl all the kustomization files in all the repositories of a Github user\n", os.Args[0]) - fmt.Printf("\tFor example, %s github-user kubernetes-sigs\n", os.Args[0]) - fmt.Printf("%s github-repo : Crawl all the kustomization files in a Github repo\n", os.Args[0]) - fmt.Printf("\tFor example, %s github-repo kubernetes-sigs/kustomize\n", os.Args[0]) -} - func main() { indexNamePtr := flag.String( "index", "kustomize", "The name of the ElasticSearch index.") + modePtr := flag.String("mode", "index+github", + `The crawling mode, which can be one of [github-user, github-repo, index, github, 
index+github]. + * github-user: crawl all the kustomization files in all the repositories of a Github user (--github-user must be specified for this mode). + * github-repo: crawl all the kustomization files in a Github repository (--github-repo must be specified for this mode). + * index: crawl all the documents in the index. + * github: crawl all the kustomization files on Github. + * index+github: crawl all the documents in the index and crawl all the kustomization files on Github.`) + githubUserPtr := flag.String("github-user", "", + "A github user name (e.g., kubernetes-sigs). This flag is required for the `github-user` mode.") + githubRepoPtr := flag.String("github-repo", "", + "A github repository name (e.g., kubernetes-sigs/kustomize). This flag is required for the `github-repo` mode.") flag.Parse() githubToken := os.Getenv(githubAccessTokenVar) @@ -128,12 +127,7 @@ func main() { // This helps avoid indexing a given document multiple times. seen := crawler.NewSeenMap() - var mode CrawlMode - if len(os.Args) == 1 { - mode = CrawlIndexAndGithub - } else { - mode = NewCrawlMode(os.Args[1]) - } + mode := NewCrawlMode(*modePtr) ghCrawlerConstructor := func(user, repo string) crawler.Crawler { if user != "" { @@ -192,21 +186,21 @@ func main() { crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")} crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) case CrawlUser: - if len(os.Args) < 3 { - Usage() - log.Fatalf("Please specify a github user!") + if *githubUserPtr == "" { + flag.Usage() + log.Fatalf("Please specify a github user with the github-user flag!") } - crawlers := []crawler.Crawler{ghCrawlerConstructor(os.Args[2], "")} + crawlers := []crawler.Crawler{ghCrawlerConstructor(*githubUserPtr, "")} crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) case CrawlRepo: - if len(os.Args) < 3 { - Usage() - log.Fatalf("Please specify a github repo!") + if *githubRepoPtr == "" { + flag.Usage() + log.Fatalf("Please specify a github 
repository with the github-repo flag!") } - crawlers := []crawler.Crawler{ghCrawlerConstructor("", os.Args[2])} + crawlers := []crawler.Crawler{ghCrawlerConstructor("", *githubRepoPtr)} crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) case CrawlUnknown: - Usage() - log.Fatalf("The crawler mode must be one of [github-user, github-repo, index, github]") + flag.Usage() + log.Fatalf("The --mode flag must be one of [github-user, github-repo, index, github, index+github].") } } diff --git a/api/internal/crawl/config/crawler/job/README.md b/api/internal/crawl/config/crawler/job/README.md index 3570f27ee..2f1adef7f 100644 --- a/api/internal/crawl/config/crawler/job/README.md +++ b/api/internal/crawl/config/crawler/job/README.md @@ -1,4 +1,4 @@ -There are three ways of running the crawler job. +The crawler job can run in one of the following modes: # Crawling all the documents in the index and crawling all the kustomization files on Github @@ -7,14 +7,13 @@ of the container should be: ``` command: ["/crawler"] - args: [] ``` Or ``` command: ["/crawler"] - args: [""] + args: ["--mode=index+github"] ``` # Crawling all the documents in the index @@ -23,7 +22,7 @@ The `command` and `args` field of the container should be: ``` command: ["/crawler"] - args: ["index"] + args: ["--mode=index"] ``` # Crawling all the kustomization files on Github @@ -32,7 +31,7 @@ The `command` and `args` field of the container should be: ``` command: ["/crawler"] - args: ["github"] + args: ["--mode=github"] ``` # Crawling all the kustomization files in a Github repo @@ -41,7 +40,7 @@ The `command` and `args` field of the container should be like: ``` command: ["/crawler"] - args: ["github-repo", "kubernetes-sigs/kustomize"] + args: ["--mode=github-repo", "--github-repo=kubernetes-sigs/kustomize"] ``` # Crawling all the kustomization files in all the repositories of a Github user @@ -50,5 +49,5 @@ The `command` and `args` field of the container should be like: ``` command: 
["/crawler"] - args: ["github-user", "kubernetes-sigs"] + args: ["--mode=github-user", "--github-user=kubernetes-sigs"] ``` diff --git a/api/internal/crawl/config/crawler/job/job.yaml b/api/internal/crawl/config/crawler/job/job.yaml index 28e36bcb8..23b0bea41 100644 --- a/api/internal/crawl/config/crawler/job/job.yaml +++ b/api/internal/crawl/config/crawler/job/job.yaml @@ -11,7 +11,7 @@ spec: image: gcr.io/haiyanmeng-gke-dev/crawler:v1 imagePullPolicy: Always command: ["/crawler"] - args: ["github-repo", "kubernetes-sigs/kustomize"] + args: ["--mode=github-repo", "--github-repo=kubernetes-sigs/kustomize", "--index=kustomize"] env: - name: GITHUB_ACCESS_TOKEN valueFrom: