From 127541f61056f8b2b244c18e8cdf6a9ad2b13efa Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 17 Dec 2019 14:35:44 -0800 Subject: [PATCH] Support different modes of running the crawler --- api/internal/crawl/cmd/crawler/crawler.go | 32 ++++++++++++++- .../crawl/config/crawler/job/README.md | 41 +++++++++++++++++++ .../crawl/config/crawler/job/job.yaml | 2 +- 3 files changed, 72 insertions(+), 3 deletions(-) create mode 100644 api/internal/crawl/config/crawler/job/README.md diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index ea73efd04..7c62522af 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -3,6 +3,7 @@ package main import ( "context" "fmt" + "log" "net/http" "os" "time" @@ -23,11 +24,31 @@ const ( retryCount = 3 githubUserEnv = "GITHUB_USER" githubRepoEnv = "GITHUB_REPO" + crawlIndexOnlyEnv = "CRAWL_INDEX_ONLY" + crawlGithubOnlyEnv = "CRAWL_GITHUB_ONLY" ) +// countEnvs counts the environment variables whose values are not empty. 
+func countEnvs(envs ...string) int { + count := 0 + for _, env := range envs { + if env != "" { + count++ + } + } + return count +} + func main() { githubUser := os.Getenv(githubUserEnv) githubRepo := os.Getenv(githubRepoEnv) + crawlIndexOnly := os.Getenv(crawlIndexOnlyEnv) + crawlGithubOnly := os.Getenv(crawlGithubOnlyEnv) + + if countEnvs(githubUser, githubRepo, crawlIndexOnly, crawlGithubOnly) > 1 { + log.Fatalf("only one of [%s, %s, %s, %s] should be set", + githubUserEnv, githubRepoEnv, crawlIndexOnlyEnv, crawlGithubOnlyEnv) + } githubToken := os.Getenv(githubAccessTokenVar) if githubToken == "" { @@ -122,6 +143,13 @@ func main() { } crawlers := []crawler.Crawler{ghCrawler} - crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) - crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + + if crawlGithubOnly == "true" || githubRepo != "" || githubUser != "" { + crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + } else if crawlIndexOnly == "true" { + crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) + } else { + crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) + crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + } } diff --git a/api/internal/crawl/config/crawler/job/README.md b/api/internal/crawl/config/crawler/job/README.md new file mode 100644 index 000000000..3fc3e665b --- /dev/null +++ b/api/internal/crawl/config/crawler/job/README.md @@ -0,0 +1,41 @@ +There are five ways of running the crawler job. + +# Crawling all the documents in the index and crawling all the kustomization files on Github + +This is the default setting of the crawler job. 
+ +# Crawling all the documents in the index + +Set the environment variable `CRAWL_INDEX_ONLY` to `true` like this: + +``` + - name: CRAWL_INDEX_ONLY + value: "true" +``` + +# Crawling all the kustomization files on Github + +Set the environment variable `CRAWL_GITHUB_ONLY` to `true` like this: + +``` + - name: CRAWL_GITHUB_ONLY + value: "true" +``` + +# Crawling all the kustomization files in a Github repo + +Add the environment variable `GITHUB_REPO` into the crawler container. For example: + +``` + - name: GITHUB_REPO + value: kubernetes-sigs/kustomize +``` + +# Crawling all the kustomization files in all the repositories of a Github user + +Add the environment variable `GITHUB_USER` into the crawler container. For example: + +``` + - name: GITHUB_USER + value: kubernetes-sigs +``` diff --git a/api/internal/crawl/config/crawler/job/job.yaml b/api/internal/crawl/config/crawler/job/job.yaml index dde0de398..6dd8d4c97 100644 --- a/api/internal/crawl/config/crawler/job/job.yaml +++ b/api/internal/crawl/config/crawler/job/job.yaml @@ -8,7 +8,7 @@ spec: restartPolicy: OnFailure containers: - name: crawler - image: gcr.io/kustomize-search/crawler:latest + image: gcr.io/haiyanmeng-gke-dev/crawler:v1 imagePullPolicy: Always env: - name: GITHUB_ACCESS_TOKEN