Remove unused param from IndexFunc

This commit is contained in:
Haiyan Meng
2019-12-17 14:56:52 -08:00
parent 127541f610
commit be2e03681d
5 changed files with 135 additions and 71 deletions

View File

@@ -22,34 +22,53 @@ const (
redisCacheURL = "REDIS_CACHE_URL"
redisKeyURL = "REDIS_KEY_URL"
retryCount = 3
githubUserEnv = "GITHUB_USER"
githubRepoEnv = "GITHUB_REPO"
crawlIndexOnlyEnv = "CRAWL_INDEX_ONLY"
crawlGithubOnlyEnv = "CRAWL_GITHUB_ONLY"
)
// countEnvs count the environment variables whose values are not empty.
func countEnvs(envs ...string) int {
count := 0
for _, env := range envs {
if env != "" {
count++
}
type CrawlMode int
const (
CrawlUnknown CrawlMode = iota
// Crawl all the kustomization files in all the repositories of a Github user
CrawlUser
// Crawl all the kustomization files in a Github repo
CrawlRepo
// Crawl all the documents in the index
CrawlIndex
// Crawl all the kustomization files on Github
CrawlGithub
// Crawl all the documents in the index and crawling all the kustomization files on Github
CrawlIndexAndGithub
)
func NewCrawlMode(s string) CrawlMode {
switch s {
case "github-user":
return CrawlUser
case "github-repo":
return CrawlRepo
case "":
return CrawlIndexAndGithub
case "index":
return CrawlIndex
case "github":
return CrawlGithub
default:
return CrawlUnknown
}
return count
}
func Usage() {
fmt.Printf("Usage: %s [mode] [githubUser|githubRepo]\n", os.Args[0])
fmt.Printf("\tmode can be one of [github-user, github-repo, index, github]\n")
fmt.Printf("%s: crawl all the documents in the index and crawling all the kustomization files on Github\n", os.Args[0])
fmt.Printf("%s index: crawl all the documents in the index\n", os.Args[0])
fmt.Printf("%s gihub: crawl all the kustomization files on Github\n", os.Args[0])
fmt.Printf("%s github-user <github-user>: Crawl all the kustomization files in all the repositories of a Github user\n", os.Args[0])
fmt.Printf("\tFor example, %s github-user kubernetes-sigs\n", os.Args[0])
fmt.Printf("%s github-repo <github-repo>: Crawl all the kustomization files in a Github repo\n", os.Args[0])
fmt.Printf("\tFor example, %s github-repo kubernetes-sigs/kustomize\n", os.Args[0])
}
func main() {
githubUser := os.Getenv(githubUserEnv)
githubRepo := os.Getenv(githubRepoEnv)
crawlIndexOnly := os.Getenv(crawlIndexOnlyEnv)
crawlGithubOnly := os.Getenv(crawlGithubOnlyEnv)
if countEnvs(githubUser, githubRepo, crawlIndexOnly, crawlGithubOnly) > 1 {
log.Fatalf("only one of [%s, %s, %s, %s] should be set",
githubUserEnv, githubRepoEnv, crawlIndexOnlyEnv, crawlGithubOnlyEnv)
}
githubToken := os.Getenv(githubAccessTokenVar)
if githubToken == "" {
fmt.Printf("Must set the variable '%s' to make github requests.\n",
@@ -64,8 +83,6 @@ func main() {
return
}
seedDocs := make(crawler.CrawlSeed, 0)
cacheURL := os.Getenv(redisCacheURL)
cache, err := redis.DialURL(cacheURL)
clientCache := &http.Client{}
@@ -86,7 +103,7 @@ func main() {
}
// Index updates the value in the index.
indexFunc := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler, mode index.Mode) error {
indexFunc := func(cdoc crawler.CrawledDocument, mode index.Mode) error {
switch d := cdoc.(type) {
case *doc.KustomizationDocument:
switch mode {
@@ -106,30 +123,41 @@ func main() {
// This helps avoid indexing a given document multiple times.
seen := make(map[string]struct{})
var ghCrawler crawler.Crawler
if githubRepo != "" {
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith(
github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml"),
github.Repo(githubRepo)),
)
} else if githubUser != "" {
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith(
github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml"),
github.User(githubUser)),
)
var mode CrawlMode
if len(os.Args) == 1 {
mode = CrawlIndexAndGithub
} else {
ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith(
github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml")),
)
mode = NewCrawlMode(os.Args[1])
}
// get all the documents in the index
ghCrawlerConstructor := func(user, repo string) crawler.Crawler {
if user != "" {
return github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith(
github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml"),
github.User(user)),
)
} else if repo != "" {
return github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith(
github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml"),
github.Repo(repo)),
)
} else {
return github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith(
github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml")),
)
}
}
seedDocs := make(crawler.CrawlSeed, 0)
// get all the documents in the index
getSeedDocsFunc := func() {
query := []byte(`{ "query":{ "match_all":{} } }`)
it := idx.IterateQuery(query, 10000, 60*time.Second)
for it.Next() {
@@ -142,14 +170,35 @@ func main() {
}
}
crawlers := []crawler.Crawler{ghCrawler}
if crawlGithubOnly == "true" || githubRepo != "" || githubUser != "" {
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
} else if crawlIndexOnly == "true" {
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
} else {
switch mode {
case CrawlIndexAndGithub:
getSeedDocsFunc()
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlIndex:
getSeedDocsFunc()
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
case CrawlGithub:
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlUser:
if len(os.Args) < 3 {
Usage()
log.Fatalf("Please specify a github user!")
}
crawlers := []crawler.Crawler{ghCrawlerConstructor(os.Args[2], "")}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlRepo:
if len(os.Args) < 3 {
Usage()
log.Fatalf("Please specify a github repo!")
}
crawlers := []crawler.Crawler{ghCrawlerConstructor("", os.Args[2])}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlUnknown:
Usage()
log.Fatalf("The crawler mode must be one of [github-user, github-repo, index, github]")
}
}

View File

@@ -2,40 +2,53 @@ There are three ways of running the crawler job.
# Crawling all the documents in the index and crawling all the kustomization files on Github
This is the default setting of the crawler job.
This is the default setting of the crawler job. The `command` and `args` field
of the container should be:
```
command: ["/crawler"]
args: []
```
Or
```
command: ["/crawler"]
args: [""]
```
# Crawling all the documents in the index
Set the environment variable `CRAWL_INDEX_ONLY` to `true` like this:
The `command` and `args` field of the container should be:
```
- name: CRAWL_INDEX_ONLY
value: true
command: ["/crawler"]
args: ["index"]
```
# Crawling all the kustomization files on Github
Set the environment variable `CRAWL_GITHUB_ONLY` to `true` like this:
The `command` and `args` field of the container should be:
```
- name: CRAWL_GITHUB_ONLY
value: true
command: ["/crawler"]
args: ["github"]
```
# Crawling all the kustomization files in a Github repo
Add the environment variable `GITHUB_REPO` into the crawler container. For example:
The `command` and `args` field of the container should be like:
```
- name: GITHUB_REPO
value: kubernetes-sigs/kustomize
command: ["/crawler"]
args: ["github-repo", "kubernetes-sigs/kustomize"]
```
# Crawling all the kustomization files in all the repositories of a Github user
Add the environment variable `GITHUB_USER` into the crawler container. For example:
The `command` and `args` field of the container should be like:
```
- name: GITHUB_USER
value: kubernetes-sigs
command: ["/crawler"]
args: ["github-user", "kubernetes-sigs"]
```

View File

@@ -10,6 +10,8 @@ spec:
- name: crawler
image: gcr.io/haiyanmeng-gke-dev/crawler:v1
imagePullPolicy: Always
command: ["/crawler"]
args: ["github-repo", "kubernetes-sigs/kustomize"]
env:
- name: GITHUB_ACCESS_TOKEN
valueFrom:

View File

@@ -49,7 +49,7 @@ type CrawledDocument interface {
type CrawlSeed []*doc.Document
type IndexFunc func(CrawledDocument, Crawler, index.Mode) error
type IndexFunc func(CrawledDocument, index.Mode) error
type Converter func(*doc.Document) (CrawledDocument, error)
func logIfErr(err error) {
@@ -74,7 +74,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
seen[cdoc.ID()] = struct{}{}
// Insert into index
if err := indx(cdoc, match, index.InsertOrUpdate); err != nil {
if err := indx(cdoc, index.InsertOrUpdate); err != nil {
logger.Printf("Failed to insert or update %s %s: %v",
cdoc.GetDocument().RepositoryURL, cdoc.GetDocument().FilePath, err)
return
@@ -142,7 +142,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
Document: *tail,
}
seen[cdoc.ID()] = struct{}{}
if err := indx(cdoc, match, index.Delete); err != nil {
if err := indx(cdoc, index.Delete); err != nil {
logger.Printf("Failed to delete %s %s: %v",
cdoc.RepositoryURL, cdoc.FilePath, err)
}

View File

@@ -318,7 +318,7 @@ resources:
Document: *d,
}, nil
},
func(d CrawledDocument, cr Crawler, mode index.Mode) error {
func(d CrawledDocument, mode index.Mode) error {
visited[d.ID()]++
return nil
},