Merge pull request #2102 from haiyanmeng/seed

Use flags for configuring the crawler job
This commit is contained in:
Kubernetes Prow Robot
2020-01-14 17:02:36 -08:00
committed by GitHub
9 changed files with 58 additions and 56 deletions

View File

@@ -2,6 +2,7 @@ package main
import (
"context"
"flag"
"fmt"
"log"
"net/http"
@@ -45,7 +46,7 @@ func NewCrawlMode(s string) CrawlMode {
return CrawlUser
case "github-repo":
return CrawlRepo
case "":
case "index+github":
return CrawlIndexAndGithub
case "index":
return CrawlIndex
@@ -56,30 +57,33 @@ func NewCrawlMode(s string) CrawlMode {
}
}
// Usage prints the positional-argument help text for the crawler to standard
// output. The program name shown in every line is taken from os.Args[0].
//
// The text lists the general invocation form, the accepted mode names, and one
// line per mode, with an example for the two modes that take an extra
// argument (github-user and github-repo).
func Usage() {
	// Every line of the help text is prefixed with the program name.
	prog := os.Args[0]

	fmt.Printf("Usage: %s [mode] [githubUser|githubRepo]\n", prog)
	fmt.Printf("\tmode can be one of [github-user, github-repo, index, github]\n")
	fmt.Printf("%s: crawl all the documents in the index and crawling all the kustomization files on Github\n", prog)
	fmt.Printf("%s index: crawl all the documents in the index\n", prog)
	// The mode name must be spelled exactly as listed above ("github"); the
	// help previously advertised a misspelled "gihub" mode.
	fmt.Printf("%s github: crawl all the kustomization files on Github\n", prog)
	fmt.Printf("%s github-user <github-user>: Crawl all the kustomization files in all the repositories of a Github user\n", prog)
	fmt.Printf("\tFor example, %s github-user kubernetes-sigs\n", prog)
	fmt.Printf("%s github-repo <github-repo>: Crawl all the kustomization files in a Github repo\n", prog)
	fmt.Printf("\tFor example, %s github-repo kubernetes-sigs/kustomize\n", prog)
}
func main() {
indexNamePtr := flag.String(
"index", "kustomize", "The name of the ElasticSearch index.")
modePtr := flag.String("mode", "index+github",
`The crawling mode, which can be one of [github-user, github-repo, index, github, index+github].
* github-user: crawl all the kustomization files in all the repositories of a Github user (--github-user must be specified for this mode).
* github-repo: crawl all the kustomization files in a Github repository (--github-repo must be specified for this mode).
* index: crawl all the documents in the index.
* gihub: crawl all the kustomization files on Github.
* index+github: crawl all the documents in the index and crawling all the kustomization files on Github.`)
githubUserPtr := flag.String("github-user", "",
"A github user name (e.g., kubernetes-sigs). This flag is required for the `github-user` mode.")
githubRepoPtr := flag.String("github-repo", "",
"A github repository name (e.g., kubernetes-sigs/kustomize). This flag is required for the `github-repo` mode.")
flag.Parse()
githubToken := os.Getenv(githubAccessTokenVar)
if githubToken == "" {
fmt.Printf("Must set the variable '%s' to make github requests.\n",
log.Printf("Must set the variable '%s' to make github requests.\n",
githubAccessTokenVar)
return
}
ctx := context.Background()
idx, err := index.NewKustomizeIndex(ctx)
idx, err := index.NewKustomizeIndex(ctx, *indexNamePtr)
if err != nil {
fmt.Printf("Could not create an index: %v\n", err)
log.Printf("Could not create an index: %v\n", err)
return
}
@@ -87,7 +91,7 @@ func main() {
cache, err := redis.DialURL(cacheURL)
clientCache := &http.Client{}
if err != nil {
fmt.Printf("Error: redis could not make a connection: %v\n", err)
log.Printf("Error: redis could not make a connection: %v\n", err)
} else {
clientCache = httpclient.NewClient(cache)
}
@@ -108,10 +112,10 @@ func main() {
case *doc.KustomizationDocument:
switch mode {
case index.Delete:
fmt.Println("Deleting: ", d)
log.Printf("Deleting: %v", d)
return idx.Delete(d.ID())
default:
fmt.Println("Inserting: ", d)
log.Printf("Inserting: %v", d)
return idx.Put(d.ID(), d)
}
default:
@@ -123,12 +127,7 @@ func main() {
// This helps avoid indexing a given document multiple times.
seen := crawler.NewSeenMap()
var mode CrawlMode
if len(os.Args) == 1 {
mode = CrawlIndexAndGithub
} else {
mode = NewCrawlMode(os.Args[1])
}
mode := NewCrawlMode(*modePtr)
ghCrawlerConstructor := func(user, repo string) crawler.Crawler {
if user != "" {
@@ -169,7 +168,7 @@ func main() {
}
}
if err := it.Err(); err != nil {
fmt.Printf("Error iterating: %v\n", err)
log.Fatalf("getSeedDocsFunc Error iterating: %v\n", err)
}
}
@@ -187,21 +186,21 @@ func main() {
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlUser:
if len(os.Args) < 3 {
Usage()
log.Fatalf("Please specify a github user!")
if *githubUserPtr == "" {
flag.Usage()
log.Fatalf("Please specify a github user with the github-user flag!")
}
crawlers := []crawler.Crawler{ghCrawlerConstructor(os.Args[2], "")}
crawlers := []crawler.Crawler{ghCrawlerConstructor(*githubUserPtr, "")}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlRepo:
if len(os.Args) < 3 {
Usage()
log.Fatalf("Please specify a github repo!")
if *githubRepoPtr == "" {
flag.Usage()
log.Fatalf("Please specify a github repository with the github-repo flag!")
}
crawlers := []crawler.Crawler{ghCrawlerConstructor("", os.Args[2])}
crawlers := []crawler.Crawler{ghCrawlerConstructor("", *githubRepoPtr)}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlUnknown:
Usage()
log.Fatalf("The crawler mode must be one of [github-user, github-repo, index, github]")
flag.Usage()
log.Fatalf("The --mode flag must be one of [github-user, github-repo, index, github, index+github].")
}
}

View File

@@ -36,6 +36,7 @@ func main() {
m := entry.(map[string]interface{})
if payload, ok := m["textPayload"]; ok {
// use fmt.Printf here instead of log.Printf to avoid the time and code location info the log package provides
fmt.Printf("%s", payload)
} else {
log.Printf("the log entry does not have the `textPayload` field: %s\n", line)

View File

@@ -2,5 +2,4 @@ configmapGenerator:
- name: elasticsearch-config
literals:
- es-url="http://esbasic-master:9200"
- kustomize-index-name="kustomize"
- plugin-index-name="plugin"

View File

@@ -1,4 +1,4 @@
There are three ways of running the crawler job.
The crawler job can run in one of the following modes:
# Crawling all the documents in the index and crawling all the kustomization files on Github
@@ -7,14 +7,13 @@ of the container should be:
```
command: ["/crawler"]
args: []
```
Or
```
command: ["/crawler"]
args: [""]
args: ["--mode=index+github"]
```
# Crawling all the documents in the index
@@ -23,7 +22,7 @@ The `command` and `args` field of the container should be:
```
command: ["/crawler"]
args: ["index"]
args: ["--mode=index"]
```
# Crawling all the kustomization files on Github
@@ -32,7 +31,7 @@ The `command` and `args` field of the container should be:
```
command: ["/crawler"]
args: ["github"]
args: ["--mode=github"]
```
# Crawling all the kustomization files in a Github repo
@@ -41,7 +40,7 @@ The `command` and `args` field of the container should be like:
```
command: ["/crawler"]
args: ["github-repo", "kubernetes-sigs/kustomize"]
args: ["--mode=github-repo", "--github-repo=kubernetes-sigs/kustomize"]
```
# Crawling all the kustomization files in all the repositories of a Github user
@@ -50,5 +49,5 @@ The `command` and `args` field of the container should be like:
```
command: ["/crawler"]
args: ["github-user", "kubernetes-sigs"]
args: ["--mode=github-user", "--github-user=kubernetes-sigs"]
```

View File

@@ -11,7 +11,7 @@ spec:
image: gcr.io/haiyanmeng-gke-dev/crawler:v1
imagePullPolicy: Always
command: ["/crawler"]
args: ["github-repo", "kubernetes-sigs/kustomize"]
args: ["--mode=github-repo", "--github-repo=kubernetes-sigs/kustomize", "--index=kustomize"]
env:
- name: GITHUB_ACCESS_TOKEN
valueFrom:

View File

@@ -4,6 +4,7 @@ import (
"context"
"errors"
"fmt"
"log"
"reflect"
"sort"
"strings"
@@ -110,7 +111,7 @@ func (s sortableDocs) Len() int {
}
func TestCrawlGithubRunner(t *testing.T) {
fmt.Println("testing CrawlGithubRunner")
log.Println("testing CrawlGithubRunner")
tests := []struct {
tc []Crawler
errs []error
@@ -216,7 +217,7 @@ func TestCrawlGithubRunner(t *testing.T) {
}
func TestCrawlFromSeed(t *testing.T) {
fmt.Println("testing CrawlFromSeed")
log.Println("testing CrawlFromSeed")
tests := []struct {
seed CrawlSeed

View File

@@ -2,6 +2,7 @@ package github
import (
"fmt"
"log"
"reflect"
"testing"
)
@@ -11,7 +12,7 @@ type testCachedSearch struct {
}
func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) {
fmt.Printf("CountResults(%05x)\n", upperBound)
log.Printf("CountResults(%05x)\n", upperBound)
count, ok := c.cache[upperBound]
if !ok {
return count, fmt.Errorf("cache not set at %x", upperBound)

View File

@@ -2,6 +2,7 @@ package doc
import (
"fmt"
"log"
"sort"
"strings"
@@ -83,7 +84,7 @@ func (doc *KustomizationDocument) GetResources() ([]*Document, error) {
}
next, err := doc.Document.FromRelativePath(r)
if err != nil {
fmt.Printf("GetResources error: %v\n", err)
log.Printf("GetResources error: %v\n", err)
continue
}
res = append(res, &next)

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"io"
"io/ioutil"
"log"
"strings"
"time"
@@ -97,14 +98,14 @@ type KustomizeIndex struct {
}
// Create index reference to the index containing the kustomize documents.
func NewKustomizeIndex(ctx context.Context) (*KustomizeIndex, error) {
idx, err := newIndex(ctx, "kustomize")
func NewKustomizeIndex(ctx context.Context, indexName string) (*KustomizeIndex, error) {
idx, err := newIndex(ctx, indexName)
if err != nil {
return nil, err
}
indicesExistsOp := idx.client.Indices.Exists
resp, err := indicesExistsOp([]string{"kustomize"},
resp, err := indicesExistsOp([]string{indexName},
indicesExistsOp.WithContext(idx.ctx),
indicesExistsOp.WithPretty())
if err != nil {
@@ -112,9 +113,9 @@ func NewKustomizeIndex(ctx context.Context) (*KustomizeIndex, error) {
}
if resp.StatusCode == 200 {
fmt.Printf("The kustomize index already exists\n")
log.Printf("The %s index already exists", indexName)
} else {
fmt.Printf("Creating the kustomize index\n")
log.Printf("Creating the %s index\n", indexName)
if err := idx.CreateIndex([]byte(IndexConfig)); err != nil {
return nil, err
}
@@ -252,7 +253,7 @@ func (it *KustomizeIterator) Next() bool {
}
if it.err == nil {
fmt.Printf("updating scroll: %s\n", *it.scrollImpl.ScrollID)
log.Printf("updating scroll: %s\n", *it.scrollImpl.ScrollID)
it.err = it.update(*it.scrollImpl.ScrollID, reader)
}
@@ -341,7 +342,7 @@ func (ki *KustomizeIndex) Search(query string,
if err != nil {
return nil, fmt.Errorf("failed to format query %s", query)
}
fmt.Printf("formated query: %s\n", data)
log.Printf("formated query: %s\n", data)
var kr ElasticKustomizeResult
err = ki.index.Search(data, opts.SearchOptions, func(results io.Reader) error {