Merge pull request #2102 from haiyanmeng/seed

Use flags for configuring the crawler job
This commit is contained in:
Kubernetes Prow Robot
2020-01-14 17:02:36 -08:00
committed by GitHub
9 changed files with 58 additions and 56 deletions

View File

@@ -2,6 +2,7 @@ package main
import ( import (
"context" "context"
"flag"
"fmt" "fmt"
"log" "log"
"net/http" "net/http"
@@ -45,7 +46,7 @@ func NewCrawlMode(s string) CrawlMode {
return CrawlUser return CrawlUser
case "github-repo": case "github-repo":
return CrawlRepo return CrawlRepo
case "": case "index+github":
return CrawlIndexAndGithub return CrawlIndexAndGithub
case "index": case "index":
return CrawlIndex return CrawlIndex
@@ -56,30 +57,33 @@ func NewCrawlMode(s string) CrawlMode {
} }
} }
// Usage prints the crawler's command-line help to standard output.
//
// It prints the general invocation form first, then one line for each
// supported way of running the crawler, with an example for the modes that
// take an extra argument. The program name shown is os.Args[0].
func Usage() {
	prog := os.Args[0]

	fmt.Printf("Usage: %s [mode] [githubUser|githubRepo]\n", prog)
	fmt.Printf("\tmode can be one of [github-user, github-repo, index, github]\n")
	fmt.Printf("%s: crawl all the documents in the index and crawling all the kustomization files on Github\n", prog)
	fmt.Printf("%s index: crawl all the documents in the index\n", prog)
	// The mode name must be spelled exactly as in the mode list above
	// ("github"); the help previously advertised it as "gihub".
	fmt.Printf("%s github: crawl all the kustomization files on Github\n", prog)
	fmt.Printf("%s github-user <github-user>: Crawl all the kustomization files in all the repositories of a Github user\n", prog)
	fmt.Printf("\tFor example, %s github-user kubernetes-sigs\n", prog)
	fmt.Printf("%s github-repo <github-repo>: Crawl all the kustomization files in a Github repo\n", prog)
	fmt.Printf("\tFor example, %s github-repo kubernetes-sigs/kustomize\n", prog)
}
func main() { func main() {
indexNamePtr := flag.String(
"index", "kustomize", "The name of the ElasticSearch index.")
modePtr := flag.String("mode", "index+github",
`The crawling mode, which can be one of [github-user, github-repo, index, github, index+github].
* github-user: crawl all the kustomization files in all the repositories of a Github user (--github-user must be specified for this mode).
* github-repo: crawl all the kustomization files in a Github repository (--github-repo must be specified for this mode).
* index: crawl all the documents in the index.
* gihub: crawl all the kustomization files on Github.
* index+github: crawl all the documents in the index and crawling all the kustomization files on Github.`)
githubUserPtr := flag.String("github-user", "",
"A github user name (e.g., kubernetes-sigs). This flag is required for the `github-user` mode.")
githubRepoPtr := flag.String("github-repo", "",
"A github repository name (e.g., kubernetes-sigs/kustomize). This flag is required for the `github-repo` mode.")
flag.Parse()
githubToken := os.Getenv(githubAccessTokenVar) githubToken := os.Getenv(githubAccessTokenVar)
if githubToken == "" { if githubToken == "" {
fmt.Printf("Must set the variable '%s' to make github requests.\n", log.Printf("Must set the variable '%s' to make github requests.\n",
githubAccessTokenVar) githubAccessTokenVar)
return return
} }
ctx := context.Background() ctx := context.Background()
idx, err := index.NewKustomizeIndex(ctx) idx, err := index.NewKustomizeIndex(ctx, *indexNamePtr)
if err != nil { if err != nil {
fmt.Printf("Could not create an index: %v\n", err) log.Printf("Could not create an index: %v\n", err)
return return
} }
@@ -87,7 +91,7 @@ func main() {
cache, err := redis.DialURL(cacheURL) cache, err := redis.DialURL(cacheURL)
clientCache := &http.Client{} clientCache := &http.Client{}
if err != nil { if err != nil {
fmt.Printf("Error: redis could not make a connection: %v\n", err) log.Printf("Error: redis could not make a connection: %v\n", err)
} else { } else {
clientCache = httpclient.NewClient(cache) clientCache = httpclient.NewClient(cache)
} }
@@ -108,10 +112,10 @@ func main() {
case *doc.KustomizationDocument: case *doc.KustomizationDocument:
switch mode { switch mode {
case index.Delete: case index.Delete:
fmt.Println("Deleting: ", d) log.Printf("Deleting: %v", d)
return idx.Delete(d.ID()) return idx.Delete(d.ID())
default: default:
fmt.Println("Inserting: ", d) log.Printf("Inserting: %v", d)
return idx.Put(d.ID(), d) return idx.Put(d.ID(), d)
} }
default: default:
@@ -123,12 +127,7 @@ func main() {
// This helps avoid indexing a given document multiple times. // This helps avoid indexing a given document multiple times.
seen := crawler.NewSeenMap() seen := crawler.NewSeenMap()
var mode CrawlMode mode := NewCrawlMode(*modePtr)
if len(os.Args) == 1 {
mode = CrawlIndexAndGithub
} else {
mode = NewCrawlMode(os.Args[1])
}
ghCrawlerConstructor := func(user, repo string) crawler.Crawler { ghCrawlerConstructor := func(user, repo string) crawler.Crawler {
if user != "" { if user != "" {
@@ -169,7 +168,7 @@ func main() {
} }
} }
if err := it.Err(); err != nil { if err := it.Err(); err != nil {
fmt.Printf("Error iterating: %v\n", err) log.Fatalf("getSeedDocsFunc Error iterating: %v\n", err)
} }
} }
@@ -187,21 +186,21 @@ func main() {
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")} crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlUser: case CrawlUser:
if len(os.Args) < 3 { if *githubUserPtr == "" {
Usage() flag.Usage()
log.Fatalf("Please specify a github user!") log.Fatalf("Please specify a github user with the github-user flag!")
} }
crawlers := []crawler.Crawler{ghCrawlerConstructor(os.Args[2], "")} crawlers := []crawler.Crawler{ghCrawlerConstructor(*githubUserPtr, "")}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlRepo: case CrawlRepo:
if len(os.Args) < 3 { if *githubRepoPtr == "" {
Usage() flag.Usage()
log.Fatalf("Please specify a github repo!") log.Fatalf("Please specify a github repository with the github-repo flag!")
} }
crawlers := []crawler.Crawler{ghCrawlerConstructor("", os.Args[2])} crawlers := []crawler.Crawler{ghCrawlerConstructor("", *githubRepoPtr)}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlUnknown: case CrawlUnknown:
Usage() flag.Usage()
log.Fatalf("The crawler mode must be one of [github-user, github-repo, index, github]") log.Fatalf("The --mode flag must be one of [github-user, github-repo, index, github, index+github].")
} }
} }

View File

@@ -36,6 +36,7 @@ func main() {
m := entry.(map[string]interface{}) m := entry.(map[string]interface{})
if payload, ok := m["textPayload"]; ok { if payload, ok := m["textPayload"]; ok {
// use fmt.Printf here instead of log.Printf to avoid the time and code location info the log package provides
fmt.Printf("%s", payload) fmt.Printf("%s", payload)
} else { } else {
log.Printf("the log entry does not have the `textPayload` field: %s\n", line) log.Printf("the log entry does not have the `textPayload` field: %s\n", line)

View File

@@ -2,5 +2,4 @@ configmapGenerator:
- name: elasticsearch-config - name: elasticsearch-config
literals: literals:
- es-url="http://esbasic-master:9200" - es-url="http://esbasic-master:9200"
- kustomize-index-name="kustomize"
- plugin-index-name="plugin" - plugin-index-name="plugin"

View File

@@ -1,4 +1,4 @@
There are three ways of running the crawler job. The crawler job can run in one of the following modes:
# Crawling all the documents in the index and crawling all the kustomization files on Github # Crawling all the documents in the index and crawling all the kustomization files on Github
@@ -7,14 +7,13 @@ of the container should be:
``` ```
command: ["/crawler"] command: ["/crawler"]
args: []
``` ```
Or Or
``` ```
command: ["/crawler"] command: ["/crawler"]
args: [""] args: ["--mode=index+github"]
``` ```
# Crawling all the documents in the index # Crawling all the documents in the index
@@ -23,7 +22,7 @@ The `command` and `args` field of the container should be:
``` ```
command: ["/crawler"] command: ["/crawler"]
args: ["index"] args: ["--mode=index"]
``` ```
# Crawling all the kustomization files on Github # Crawling all the kustomization files on Github
@@ -32,7 +31,7 @@ The `command` and `args` field of the container should be:
``` ```
command: ["/crawler"] command: ["/crawler"]
args: ["github"] args: ["--mode=github"]
``` ```
# Crawling all the kustomization files in a Github repo # Crawling all the kustomization files in a Github repo
@@ -41,7 +40,7 @@ The `command` and `args` field of the container should be like:
``` ```
command: ["/crawler"] command: ["/crawler"]
args: ["github-repo", "kubernetes-sigs/kustomize"] args: ["--mode=github-repo", "--github-repo=kubernetes-sigs/kustomize"]
``` ```
# Crawling all the kustomization files in all the repositories of a Github user # Crawling all the kustomization files in all the repositories of a Github user
@@ -50,5 +49,5 @@ The `command` and `args` field of the container should be like:
``` ```
command: ["/crawler"] command: ["/crawler"]
args: ["github-user", "kubernetes-sigs"] args: ["--mode=github-user", "--github-user=kubernetes-sigs"]
``` ```

View File

@@ -11,7 +11,7 @@ spec:
image: gcr.io/haiyanmeng-gke-dev/crawler:v1 image: gcr.io/haiyanmeng-gke-dev/crawler:v1
imagePullPolicy: Always imagePullPolicy: Always
command: ["/crawler"] command: ["/crawler"]
args: ["github-repo", "kubernetes-sigs/kustomize"] args: ["--mode=github-repo", "--github-repo=kubernetes-sigs/kustomize", "--index=kustomize"]
env: env:
- name: GITHUB_ACCESS_TOKEN - name: GITHUB_ACCESS_TOKEN
valueFrom: valueFrom:

View File

@@ -4,6 +4,7 @@ import (
"context" "context"
"errors" "errors"
"fmt" "fmt"
"log"
"reflect" "reflect"
"sort" "sort"
"strings" "strings"
@@ -110,7 +111,7 @@ func (s sortableDocs) Len() int {
} }
func TestCrawlGithubRunner(t *testing.T) { func TestCrawlGithubRunner(t *testing.T) {
fmt.Println("testing CrawlGithubRunner") log.Println("testing CrawlGithubRunner")
tests := []struct { tests := []struct {
tc []Crawler tc []Crawler
errs []error errs []error
@@ -216,7 +217,7 @@ func TestCrawlGithubRunner(t *testing.T) {
} }
func TestCrawlFromSeed(t *testing.T) { func TestCrawlFromSeed(t *testing.T) {
fmt.Println("testing CrawlFromSeed") log.Println("testing CrawlFromSeed")
tests := []struct { tests := []struct {
seed CrawlSeed seed CrawlSeed

View File

@@ -2,6 +2,7 @@ package github
import ( import (
"fmt" "fmt"
"log"
"reflect" "reflect"
"testing" "testing"
) )
@@ -11,7 +12,7 @@ type testCachedSearch struct {
} }
func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) { func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) {
fmt.Printf("CountResults(%05x)\n", upperBound) log.Printf("CountResults(%05x)\n", upperBound)
count, ok := c.cache[upperBound] count, ok := c.cache[upperBound]
if !ok { if !ok {
return count, fmt.Errorf("cache not set at %x", upperBound) return count, fmt.Errorf("cache not set at %x", upperBound)

View File

@@ -2,6 +2,7 @@ package doc
import ( import (
"fmt" "fmt"
"log"
"sort" "sort"
"strings" "strings"
@@ -83,7 +84,7 @@ func (doc *KustomizationDocument) GetResources() ([]*Document, error) {
} }
next, err := doc.Document.FromRelativePath(r) next, err := doc.Document.FromRelativePath(r)
if err != nil { if err != nil {
fmt.Printf("GetResources error: %v\n", err) log.Printf("GetResources error: %v\n", err)
continue continue
} }
res = append(res, &next) res = append(res, &next)

View File

@@ -6,6 +6,7 @@ import (
"fmt" "fmt"
"io" "io"
"io/ioutil" "io/ioutil"
"log"
"strings" "strings"
"time" "time"
@@ -97,14 +98,14 @@ type KustomizeIndex struct {
} }
// Create index reference to the index containing the kustomize documents. // Create index reference to the index containing the kustomize documents.
func NewKustomizeIndex(ctx context.Context) (*KustomizeIndex, error) { func NewKustomizeIndex(ctx context.Context, indexName string) (*KustomizeIndex, error) {
idx, err := newIndex(ctx, "kustomize") idx, err := newIndex(ctx, indexName)
if err != nil { if err != nil {
return nil, err return nil, err
} }
indicesExistsOp := idx.client.Indices.Exists indicesExistsOp := idx.client.Indices.Exists
resp, err := indicesExistsOp([]string{"kustomize"}, resp, err := indicesExistsOp([]string{indexName},
indicesExistsOp.WithContext(idx.ctx), indicesExistsOp.WithContext(idx.ctx),
indicesExistsOp.WithPretty()) indicesExistsOp.WithPretty())
if err != nil { if err != nil {
@@ -112,9 +113,9 @@ func NewKustomizeIndex(ctx context.Context) (*KustomizeIndex, error) {
} }
if resp.StatusCode == 200 { if resp.StatusCode == 200 {
fmt.Printf("The kustomize index already exists\n") log.Printf("The %s index already exists", indexName)
} else { } else {
fmt.Printf("Creating the kustomize index\n") log.Printf("Creating the %s index\n", indexName)
if err := idx.CreateIndex([]byte(IndexConfig)); err != nil { if err := idx.CreateIndex([]byte(IndexConfig)); err != nil {
return nil, err return nil, err
} }
@@ -252,7 +253,7 @@ func (it *KustomizeIterator) Next() bool {
} }
if it.err == nil { if it.err == nil {
fmt.Printf("updating scroll: %s\n", *it.scrollImpl.ScrollID) log.Printf("updating scroll: %s\n", *it.scrollImpl.ScrollID)
it.err = it.update(*it.scrollImpl.ScrollID, reader) it.err = it.update(*it.scrollImpl.ScrollID, reader)
} }
@@ -341,7 +342,7 @@ func (ki *KustomizeIndex) Search(query string,
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to format query %s", query) return nil, fmt.Errorf("failed to format query %s", query)
} }
fmt.Printf("formated query: %s\n", data) log.Printf("formated query: %s\n", data)
var kr ElasticKustomizeResult var kr ElasticKustomizeResult
err = ki.index.Search(data, opts.SearchOptions, func(results io.Reader) error { err = ki.index.Search(data, opts.SearchOptions, func(results io.Reader) error {