From 84b75afae47fe7ac5069b847c9d7b9d71d5c90ad Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 26 Nov 2019 09:50:51 -0800 Subject: [PATCH 1/3] Make the crawler work 1) add the crawler binary and fix the crawler library 2) remove the readiness probe in the search backend 3) add config for redis keystore 4) add github_api_secret.txt file with instructions --- api/internal/crawl/cmd/crawler/Dockerfile | 15 +++ api/internal/crawl/cmd/crawler/crawler.go | 99 +++++++++++++++++++ .../config/crawler/base/github_api_secret.txt | 2 + .../config/crawler/base/kustomization.yaml | 4 +- .../config/webapp/backend/deployment.yaml | 4 - api/internal/crawl/crawler/github/crawler.go | 32 +++--- api/internal/crawl/crawler/github/queries.go | 15 ++- .../crawl/crawler/github/queries_test.go | 7 +- 8 files changed, 149 insertions(+), 29 deletions(-) create mode 100644 api/internal/crawl/cmd/crawler/Dockerfile create mode 100644 api/internal/crawl/cmd/crawler/crawler.go create mode 100644 api/internal/crawl/config/crawler/base/github_api_secret.txt diff --git a/api/internal/crawl/cmd/crawler/Dockerfile b/api/internal/crawl/cmd/crawler/Dockerfile new file mode 100644 index 000000000..4a8ab957b --- /dev/null +++ b/api/internal/crawl/cmd/crawler/Dockerfile @@ -0,0 +1,15 @@ +FROM golang:1.11 AS build + +ARG GO111MODULE=on + +WORKDIR /go/src/sigs.k8s.io/kustomize/api/internal/crawl +COPY . /go/src/sigs.k8s.io/kustomize//api/internal/crawl + +RUN go mod download +RUN CGO_ENABLED=0 go install -v ./cmd/crawler/crawler.go + +FROM scratch +COPY --from=build /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ +COPY --from=build /go/bin/crawler / +ENTRYPOINT ["/crawler"] +CMD [] diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go new file mode 100644 index 000000000..4212d9f19 --- /dev/null +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -0,0 +1,99 @@ +package main + +import ( + "context" + "fmt" + "net/http" + "os" + "time" + + "sigs.k8s.io/kustomize/api/internal/crawl/crawler" + "sigs.k8s.io/kustomize/api/internal/crawl/crawler/github" + "sigs.k8s.io/kustomize/api/internal/crawl/doc" + "sigs.k8s.io/kustomize/api/internal/crawl/httpclient" + "sigs.k8s.io/kustomize/api/internal/crawl/index" + + "github.com/gomodule/redigo/redis" +) + +const ( + githubAccessTokenVar = "GITHUB_ACCESS_TOKEN" + redisCacheURL = "REDIS_CACHE_URL" + redisKeyURL = "REDIS_KEY_URL" + retryCount = 3 +) + +func main() { + githubToken := os.Getenv(githubAccessTokenVar) + if githubToken == "" { + fmt.Printf("Must set the variable '%s' to make github requests.\n", + githubAccessTokenVar) + return + } + + ctx := context.Background() + idx, err := index.NewKustomizeIndex(ctx) + if err != nil { + fmt.Printf("Could not create an index: %v\n", err) + return + } + + cacheURL := os.Getenv(redisCacheURL) + keystoreURL := os.Getenv(redisKeyURL) + + query := []byte(`{ "query":{ "match_all":{} } }`) + it := idx.IterateQuery(query, 10000, 60*time.Second) + docs := make(crawler.CrawlSeed, 0) + for it.Next() { + for _, hit := range it.Value().Hits.Hits { + docs = append(docs, hit.Document.GetDocument()) + } + } + if err := it.Err(); err != nil { + fmt.Printf("Error iterating: %v\n", err) + } + + cache, err := redis.DialURL(cacheURL) + clientCache := &http.Client{} + if err != nil { + fmt.Printf("Error: redis could not make a connection: %v\n", err) + } else { + clientCache = httpclient.NewClient(cache) + } + + _, err = redis.DialURL(keystoreURL) + if err != nil { + fmt.Printf("Error: redis could not make a connection: %v\n", err) + os.Exit(1) + } + + ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml")), + ) + + crawler.CrawlFromSeed(ctx, docs, []crawler.Crawler{ghCrawler}, + // Converter takes in a plain document and processes it for the + // index. + func(d *doc.Document) (crawler.CrawledDocument, error) { + kdoc := doc.KustomizationDocument{ + Document: *d, + } + + err := kdoc.ParseYAML() + return &kdoc, err + }, + // IndexFunc updates the value in the index. + func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error { + switch d := cdoc.(type) { + case *doc.KustomizationDocument: + fmt.Println("Inserting: ", d) + _, err := idx.Put("", d) + return err + default: + return fmt.Errorf("Type %T not supported", d) + } + }, + ) +} diff --git a/api/internal/crawl/config/crawler/base/github_api_secret.txt b/api/internal/crawl/config/crawler/base/github_api_secret.txt new file mode 100644 index 000000000..2bea8de3e --- /dev/null +++ b/api/internal/crawl/config/crawler/base/github_api_secret.txt @@ -0,0 +1,2 @@ + +Run: printf "" > github_api_secret.txt diff --git a/api/internal/crawl/config/crawler/base/kustomization.yaml b/api/internal/crawl/config/crawler/base/kustomization.yaml index f7cee507a..2f2216308 100644 --- a/api/internal/crawl/config/crawler/base/kustomization.yaml +++ b/api/internal/crawl/config/crawler/base/kustomization.yaml @@ -5,7 +5,9 @@ configmapGenerator: - name: crawler-http-cache literals: - redis-cache-url="redis://redis-http-cache:6379" - +- name: redis-keystore + literals: + - keystore-url="redis://redis-docs-keystore:6379" secretGenerator: - name: github-access-token diff --git a/api/internal/crawl/config/webapp/backend/deployment.yaml b/api/internal/crawl/config/webapp/backend/deployment.yaml index d6fab166e..e533cc227 100644 --- a/api/internal/crawl/config/webapp/backend/deployment.yaml +++ b/api/internal/crawl/config/webapp/backend/deployment.yaml @@ -21,10 +21,6 @@ spec: httpGet: path: /liveness port: backend-port - readinessProbe: - httpGet: - path: /readiness - port: backend-port ports: - name: backend-port containerPort: 8080 diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 2e4060994..fa26d1404 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -16,11 +16,11 @@ import ( "strings" "time" - "sigs.k8s.io/kustomize/api/internal/git" - "sigs.k8s.io/kustomize/api/konfig" "sigs.k8s.io/kustomize/api/internal/crawl/crawler" "sigs.k8s.io/kustomize/api/internal/crawl/doc" "sigs.k8s.io/kustomize/api/internal/crawl/httpclient" + "sigs.k8s.io/kustomize/api/internal/git" + "sigs.k8s.io/kustomize/api/konfig" ) var logger = log.New(os.Stdout, "Github Crawler: ", @@ -34,11 +34,11 @@ type githubCrawler struct { type GhClient struct { RequestConfig - retryCount uint64 - client *http.Client + retryCount uint64 + client *http.Client + accessToken string } -/* func NewCrawler(accessToken string, retryCount uint64, client *http.Client, query Query) githubCrawler { @@ -47,14 +47,13 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client, retryCount: retryCount, client: client, RequestConfig: RequestConfig{ - perPage: githubMaxPageSize, - accessToken: accessToken, + perPage: githubMaxPageSize, }, + accessToken: accessToken, }, query: query, } } -*/ // Implements crawler.Crawler. func (gc githubCrawler) Crawl( @@ -64,6 +63,7 @@ func (gc githubCrawler) Crawl( RequestConfig: gc.client.RequestConfig, client: &http.Client{Timeout: gc.client.client.Timeout}, retryCount: gc.client.retryCount, + accessToken: gc.client.accessToken, } // Since Github returns a max of 1000 results per query, we can use @@ -129,7 +129,7 @@ func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) erro continue } } - return fmt.Errorf("file not found: %s", url) + return fmt.Errorf("file not found: %s, error: %v", url, err) } func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error { @@ -534,10 +534,20 @@ func (gcl GhClient) GetRawUserContent(query string) (*http.Response, error) { return gcl.getWithRetry(query) } +func (gcl GhClient) Do(query string) (*http.Response, error) { + req, err := http.NewRequest("GET", query, nil) + if err != nil { + return nil, err + } + req.Header.Add("Authorization", fmt.Sprintf("token %s", gcl.accessToken)) + return gcl.client.Do(req) +} + func (gcl GhClient) getWithRetry( query string) (resp *http.Response, err error) { - resp, err = gcl.client.Get(query) + resp, err = gcl.Do(query) + retryCount := gcl.retryCount for err == nil && @@ -556,7 +566,7 @@ func (gcl GhClient) getWithRetry( logger.Printf("waiting %d seconds before retrying\n", i) time.Sleep(time.Second * time.Duration(i)) retryCount-- - resp, err = gcl.client.Get(query) + resp, err = gcl.Do(query) } if err != nil { diff --git a/api/internal/crawl/crawler/github/queries.go b/api/internal/crawl/crawler/github/queries.go index 3f6ca5d88..4c41b0fa8 100644 --- a/api/internal/crawl/crawler/github/queries.go +++ b/api/internal/crawl/crawler/github/queries.go @@ -11,6 +11,8 @@ const ( accessTokenArg = "access_token" ) +const githubMaxPageSize = 100 + // Implementation detail, not important to external API. type queryField struct { name string @@ -96,14 +98,12 @@ func Path(p string) queryField { // - CommitsRequests: asks Github to list commits made one a file. Useful to // determine the date of a file. type RequestConfig struct { - perPage uint64 - accessToken string + perPage uint64 } -func NewRequestConfig(perPage uint64, accessToken string) RequestConfig { +func NewRequestConfig(perPage uint64) RequestConfig { return RequestConfig{ - perPage: perPage, - accessToken: accessToken, + perPage: perPage, } } @@ -139,9 +139,6 @@ func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string { func (rc RequestConfig) makeRequest(path string, query Query) request { vals := url.Values{} - if rc.accessToken != "" { - vals.Set(accessTokenArg, rc.accessToken) - } vals.Set(perPageArg, fmt.Sprint(rc.perPage)) return request{ @@ -183,7 +180,7 @@ func (r request) URL() string { if encoded == "" && query != "" { sep = "?" } - r.url.RawQuery = encoded + sep + query + r.url.RawQuery = query + sep + encoded return r.url.String() } diff --git a/api/internal/crawl/crawler/github/queries_test.go b/api/internal/crawl/crawler/github/queries_test.go index 98ef7d564..db8fe571c 100644 --- a/api/internal/crawl/crawler/github/queries_test.go +++ b/api/internal/crawl/crawler/github/queries_test.go @@ -84,7 +84,6 @@ func TestGithubSearchQuery(t *testing.T) { { rc: RequestConfig{ perPage: perPage, - accessToken: accessToken, }, codeQuery: Query{ Filename("kustomization.yaml"), @@ -94,13 +93,13 @@ func TestGithubSearchQuery(t *testing.T) { path: "examples/helloWorld/kustomization.yaml", expectedCodeQuery: "https://api.github.com/search/code?" + - "access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128", + "q=filename:kustomization.yaml+size:64..128&order=desc&per_page=100&sort=indexed", expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" + - "examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100", + "examples/helloWorld/kustomization.yaml?per_page=100", expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" + - "access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml", + "q=path:examples/helloWorld/kustomization.yaml&per_page=100", }, } From 31c5e89b1fe204731c0d83251cf89a6f738e5e8c Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 26 Nov 2019 14:46:33 -0800 Subject: [PATCH 2/3] Add `String` method to KustomizationDocument to avoid printing the content of kustomization.yaml --- api/internal/crawl/doc/doc.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/api/internal/crawl/doc/doc.go b/api/internal/crawl/doc/doc.go index 5e59dd543..98a75c457 100644 --- a/api/internal/crawl/doc/doc.go +++ b/api/internal/crawl/doc/doc.go @@ -43,6 +43,12 @@ type KustomizationDocument struct { type set map[string]struct{} +func (doc *KustomizationDocument) String() string { + return fmt.Sprintf("%s %s %s %v %v %v %v %v", doc.RepositoryURL, doc.FilePath, + doc.DefaultBranch, doc.CreationTime, doc.IsSame, + doc.Kinds, doc.Identifiers, doc.Values) +} + // Implements the CrawlerDocument interface. func (doc *KustomizationDocument) GetResources() ([]*Document, error) { isResource := true From 9bba761a14bfd905098815cf0778471b3709f2f4 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 26 Nov 2019 19:38:17 -0800 Subject: [PATCH 3/3] Add config for creating an ElasticSearch Cluster --- .../crawl/config/elastic/escluster.yaml | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 api/internal/crawl/config/elastic/escluster.yaml diff --git a/api/internal/crawl/config/elastic/escluster.yaml b/api/internal/crawl/config/elastic/escluster.yaml new file mode 100644 index 000000000..47e1f5458 --- /dev/null +++ b/api/internal/crawl/config/elastic/escluster.yaml @@ -0,0 +1,43 @@ +apiVersion: elasticsearch.cloud.google.com/v1alpha1 +kind: ESCluster +metadata: + name: esbasic +spec: + plugin: + pluginList: + - repository-gcs + - ingest-user-agent + - ingest-geoip + config: + env: + example: test + nodegroups: + - name: di + replicas: 2 + data: true + ingest: true + config: + jvm: + - Djava.net.preferIPv4Stack=true + - Xms2g + - Xmx2g + es: + path.repo: '["/tmp/es_backup_basic"]' + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - topologyKey: kubernetes.io/hostname + labelSelector: + matchLabels: + es/nodegroup: di + resources: + requests: + memory: 3Gi + limits: + memory: 3Gi + - name: m + replicas: 2 + master: true + config: + es: + path.repo: '["/tmp/es_backup_basic"]'