diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index 63df31409..4a7883e54 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -2,6 +2,7 @@ package main import ( "context" + "flag" "fmt" "log" "net/http" @@ -45,7 +46,7 @@ func NewCrawlMode(s string) CrawlMode { return CrawlUser case "github-repo": return CrawlRepo - case "": + case "index+github": return CrawlIndexAndGithub case "index": return CrawlIndex @@ -56,30 +57,33 @@ func NewCrawlMode(s string) CrawlMode { } } -func Usage() { - fmt.Printf("Usage: %s [mode] [githubUser|githubRepo]\n", os.Args[0]) - fmt.Printf("\tmode can be one of [github-user, github-repo, index, github]\n") - fmt.Printf("%s: crawl all the documents in the index and crawling all the kustomization files on Github\n", os.Args[0]) - fmt.Printf("%s index: crawl all the documents in the index\n", os.Args[0]) - fmt.Printf("%s gihub: crawl all the kustomization files on Github\n", os.Args[0]) - fmt.Printf("%s github-user : Crawl all the kustomization files in all the repositories of a Github user\n", os.Args[0]) - fmt.Printf("\tFor example, %s github-user kubernetes-sigs\n", os.Args[0]) - fmt.Printf("%s github-repo : Crawl all the kustomization files in a Github repo\n", os.Args[0]) - fmt.Printf("\tFor example, %s github-repo kubernetes-sigs/kustomize\n", os.Args[0]) -} - func main() { + indexNamePtr := flag.String( + "index", "kustomize", "The name of the ElasticSearch index.") + modePtr := flag.String("mode", "index+github", + `The crawling mode, which can be one of [github-user, github-repo, index, github, index+github]. + * github-user: crawl all the kustomization files in all the repositories of a Github user (--github-user must be specified for this mode). + * github-repo: crawl all the kustomization files in a Github repository (--github-repo must be specified for this mode). + * index: crawl all the documents in the index. 
+ * gihub: crawl all the kustomization files on Github. + * index+github: crawl all the documents in the index and crawling all the kustomization files on Github.`) + githubUserPtr := flag.String("github-user", "", + "A github user name (e.g., kubernetes-sigs). This flag is required for the `github-user` mode.") + githubRepoPtr := flag.String("github-repo", "", + "A github repository name (e.g., kubernetes-sigs/kustomize). This flag is required for the `github-repo` mode.") + flag.Parse() + githubToken := os.Getenv(githubAccessTokenVar) if githubToken == "" { - fmt.Printf("Must set the variable '%s' to make github requests.\n", + log.Printf("Must set the variable '%s' to make github requests.\n", githubAccessTokenVar) return } ctx := context.Background() - idx, err := index.NewKustomizeIndex(ctx) + idx, err := index.NewKustomizeIndex(ctx, *indexNamePtr) if err != nil { - fmt.Printf("Could not create an index: %v\n", err) + log.Printf("Could not create an index: %v\n", err) return } @@ -87,7 +91,7 @@ func main() { cache, err := redis.DialURL(cacheURL) clientCache := &http.Client{} if err != nil { - fmt.Printf("Error: redis could not make a connection: %v\n", err) + log.Printf("Error: redis could not make a connection: %v\n", err) } else { clientCache = httpclient.NewClient(cache) } @@ -108,10 +112,10 @@ func main() { case *doc.KustomizationDocument: switch mode { case index.Delete: - fmt.Println("Deleting: ", d) + log.Printf("Deleting: %v", d) return idx.Delete(d.ID()) default: - fmt.Println("Inserting: ", d) + log.Printf("Inserting: %v", d) return idx.Put(d.ID(), d) } default: @@ -123,12 +127,7 @@ func main() { // This helps avoid indexing a given document multiple times. 
seen := crawler.NewSeenMap() - var mode CrawlMode - if len(os.Args) == 1 { - mode = CrawlIndexAndGithub - } else { - mode = NewCrawlMode(os.Args[1]) - } + mode := NewCrawlMode(*modePtr) ghCrawlerConstructor := func(user, repo string) crawler.Crawler { if user != "" { @@ -169,7 +168,7 @@ func main() { } } if err := it.Err(); err != nil { - fmt.Printf("Error iterating: %v\n", err) + log.Fatalf("getSeedDocsFunc Error iterating: %v\n", err) } } @@ -187,21 +186,21 @@ func main() { crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")} crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) case CrawlUser: - if len(os.Args) < 3 { - Usage() - log.Fatalf("Please specify a github user!") + if *githubUserPtr == "" { + flag.Usage() + log.Fatalf("Please specify a github user with the github-user flag!") } - crawlers := []crawler.Crawler{ghCrawlerConstructor(os.Args[2], "")} + crawlers := []crawler.Crawler{ghCrawlerConstructor(*githubUserPtr, "")} crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) case CrawlRepo: - if len(os.Args) < 3 { - Usage() - log.Fatalf("Please specify a github repo!") + if *githubRepoPtr == "" { + flag.Usage() + log.Fatalf("Please specify a github repository with the github-repo flag!") } - crawlers := []crawler.Crawler{ghCrawlerConstructor("", os.Args[2])} + crawlers := []crawler.Crawler{ghCrawlerConstructor("", *githubRepoPtr)} crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) case CrawlUnknown: - Usage() - log.Fatalf("The crawler mode must be one of [github-user, github-repo, index, github]") + flag.Usage() + log.Fatalf("The --mode flag must be one of [github-user, github-repo, index, github, index+github].") } } diff --git a/api/internal/crawl/cmd/log-parser/main.go b/api/internal/crawl/cmd/log-parser/main.go index ac1a00f33..e442c8349 100644 --- a/api/internal/crawl/cmd/log-parser/main.go +++ b/api/internal/crawl/cmd/log-parser/main.go @@ -36,6 +36,7 @@ func main() { m := entry.(map[string]interface{}) 
if payload, ok := m["textPayload"]; ok { + // use fmt.Printf here instead of log.Printf to avoid the time and code location info the log package provides fmt.Printf("%s", payload) } else { log.Printf("the log entry does not have the `textPayload` field: %s\n", line) diff --git a/api/internal/crawl/config/base/kustomization.yaml b/api/internal/crawl/config/base/kustomization.yaml index 12c894869..eebc9a91e 100644 --- a/api/internal/crawl/config/base/kustomization.yaml +++ b/api/internal/crawl/config/base/kustomization.yaml @@ -2,5 +2,4 @@ configmapGenerator: - name: elasticsearch-config literals: - es-url="http://esbasic-master:9200" - - kustomize-index-name="kustomize" - plugin-index-name="plugin" diff --git a/api/internal/crawl/config/crawler/job/README.md b/api/internal/crawl/config/crawler/job/README.md index 3570f27ee..2f1adef7f 100644 --- a/api/internal/crawl/config/crawler/job/README.md +++ b/api/internal/crawl/config/crawler/job/README.md @@ -1,4 +1,4 @@ -There are three ways of running the crawler job. 
+The crawler job can run in one of the following modes: # Crawling all the documents in the index and crawling all the kustomization files on Github @@ -7,14 +7,13 @@ of the container should be: ``` command: ["/crawler"] - args: [] ``` Or ``` command: ["/crawler"] - args: [""] + args: ["--mode=index+github"] ``` # Crawling all the documents in the index @@ -23,7 +22,7 @@ The `command` and `args` field of the container should be: ``` command: ["/crawler"] - args: ["index"] + args: ["--mode=index"] ``` # Crawling all the kustomization files on Github @@ -32,7 +31,7 @@ The `command` and `args` field of the container should be: ``` command: ["/crawler"] - args: ["github"] + args: ["--mode=github"] ``` # Crawling all the kustomization files in a Github repo @@ -41,7 +40,7 @@ The `command` and `args` field of the container should be like: ``` command: ["/crawler"] - args: ["github-repo", "kubernetes-sigs/kustomize"] + args: ["--mode=github-repo", "--github-repo=kubernetes-sigs/kustomize"] ``` # Crawling all the kustomization files in all the repositories of a Github user @@ -50,5 +49,5 @@ The `command` and `args` field of the container should be like: ``` command: ["/crawler"] - args: ["github-user", "kubernetes-sigs"] + args: ["--mode=github-user", "--github-user=kubernetes-sigs"] ``` diff --git a/api/internal/crawl/config/crawler/job/job.yaml b/api/internal/crawl/config/crawler/job/job.yaml index 28e36bcb8..23b0bea41 100644 --- a/api/internal/crawl/config/crawler/job/job.yaml +++ b/api/internal/crawl/config/crawler/job/job.yaml @@ -11,7 +11,7 @@ spec: image: gcr.io/haiyanmeng-gke-dev/crawler:v1 imagePullPolicy: Always command: ["/crawler"] - args: ["github-repo", "kubernetes-sigs/kustomize"] + args: ["--mode=github-repo", "--github-repo=kubernetes-sigs/kustomize", "--index=kustomize"] env: - name: GITHUB_ACCESS_TOKEN valueFrom: diff --git a/api/internal/crawl/crawler/crawler_test.go b/api/internal/crawl/crawler/crawler_test.go index 383e834a5..7dace4da6 100644 --- 
a/api/internal/crawl/crawler/crawler_test.go +++ b/api/internal/crawl/crawler/crawler_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "log" "reflect" "sort" "strings" @@ -110,7 +111,7 @@ func (s sortableDocs) Len() int { } func TestCrawlGithubRunner(t *testing.T) { - fmt.Println("testing CrawlGithubRunner") + log.Println("testing CrawlGithubRunner") tests := []struct { tc []Crawler errs []error @@ -216,7 +217,7 @@ func TestCrawlGithubRunner(t *testing.T) { } func TestCrawlFromSeed(t *testing.T) { - fmt.Println("testing CrawlFromSeed") + log.Println("testing CrawlFromSeed") tests := []struct { seed CrawlSeed diff --git a/api/internal/crawl/crawler/github/split_search_ranges_test.go b/api/internal/crawl/crawler/github/split_search_ranges_test.go index c175486e6..ad332388d 100644 --- a/api/internal/crawl/crawler/github/split_search_ranges_test.go +++ b/api/internal/crawl/crawler/github/split_search_ranges_test.go @@ -2,6 +2,7 @@ package github import ( "fmt" + "log" "reflect" "testing" ) @@ -11,7 +12,7 @@ type testCachedSearch struct { } func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) { - fmt.Printf("CountResults(%05x)\n", upperBound) + log.Printf("CountResults(%05x)\n", upperBound) count, ok := c.cache[upperBound] if !ok { return count, fmt.Errorf("cache not set at %x", upperBound) diff --git a/api/internal/crawl/doc/doc.go b/api/internal/crawl/doc/doc.go index ec2e031ba..4a709f693 100644 --- a/api/internal/crawl/doc/doc.go +++ b/api/internal/crawl/doc/doc.go @@ -2,6 +2,7 @@ package doc import ( "fmt" + "log" "sort" "strings" @@ -83,7 +84,7 @@ func (doc *KustomizationDocument) GetResources() ([]*Document, error) { } next, err := doc.Document.FromRelativePath(r) if err != nil { - fmt.Printf("GetResources error: %v\n", err) + log.Printf("GetResources error: %v\n", err) continue } res = append(res, &next) diff --git a/api/internal/crawl/index/kustomize.go b/api/internal/crawl/index/kustomize.go index cedea28bb..e55c5547e 100644 --- 
a/api/internal/crawl/index/kustomize.go +++ b/api/internal/crawl/index/kustomize.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "io/ioutil" + "log" "strings" "time" @@ -97,14 +98,14 @@ type KustomizeIndex struct { } // Create index reference to the index containing the kustomize documents. -func NewKustomizeIndex(ctx context.Context) (*KustomizeIndex, error) { - idx, err := newIndex(ctx, "kustomize") +func NewKustomizeIndex(ctx context.Context, indexName string) (*KustomizeIndex, error) { + idx, err := newIndex(ctx, indexName) if err != nil { return nil, err } indicesExistsOp := idx.client.Indices.Exists - resp, err := indicesExistsOp([]string{"kustomize"}, + resp, err := indicesExistsOp([]string{indexName}, indicesExistsOp.WithContext(idx.ctx), indicesExistsOp.WithPretty()) if err != nil { @@ -112,9 +113,9 @@ func NewKustomizeIndex(ctx context.Context) (*KustomizeIndex, error) { } if resp.StatusCode == 200 { - fmt.Printf("The kustomize index already exists\n") + log.Printf("The %s index already exists", indexName) } else { - fmt.Printf("Creating the kustomize index\n") + log.Printf("Creating the %s index\n", indexName) if err := idx.CreateIndex([]byte(IndexConfig)); err != nil { return nil, err } @@ -252,7 +253,7 @@ func (it *KustomizeIterator) Next() bool { } if it.err == nil { - fmt.Printf("updating scroll: %s\n", *it.scrollImpl.ScrollID) + log.Printf("updating scroll: %s\n", *it.scrollImpl.ScrollID) it.err = it.update(*it.scrollImpl.ScrollID, reader) } @@ -341,7 +342,7 @@ func (ki *KustomizeIndex) Search(query string, if err != nil { return nil, fmt.Errorf("failed to format query %s", query) } - fmt.Printf("formated query: %s\n", data) + log.Printf("formated query: %s\n", data) var kr ElasticKustomizeResult err = ki.index.Search(data, opts.SearchOptions, func(results io.Reader) error {