From 84b75afae47fe7ac5069b847c9d7b9d71d5c90ad Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 26 Nov 2019 09:50:51 -0800 Subject: [PATCH] Make the crawler work 1) add the crawler binary and fix the crawler library 2) remove the readiness probe in the search backend 3) add config for redis keystore 4) add github_api_secret.txt file with instructions --- api/internal/crawl/cmd/crawler/Dockerfile | 15 +++ api/internal/crawl/cmd/crawler/crawler.go | 99 +++++++++++++++++++ .../config/crawler/base/github_api_secret.txt | 2 + .../config/crawler/base/kustomization.yaml | 4 +- .../config/webapp/backend/deployment.yaml | 4 - api/internal/crawl/crawler/github/crawler.go | 32 +++--- api/internal/crawl/crawler/github/queries.go | 15 ++- .../crawl/crawler/github/queries_test.go | 7 +- 8 files changed, 149 insertions(+), 29 deletions(-) create mode 100644 api/internal/crawl/cmd/crawler/Dockerfile create mode 100644 api/internal/crawl/cmd/crawler/crawler.go create mode 100644 api/internal/crawl/config/crawler/base/github_api_secret.txt diff --git a/api/internal/crawl/cmd/crawler/Dockerfile b/api/internal/crawl/cmd/crawler/Dockerfile new file mode 100644 index 000000000..4a8ab957b --- /dev/null +++ b/api/internal/crawl/cmd/crawler/Dockerfile @@ -0,0 +1,15 @@ +FROM golang:1.11 AS build + +ARG GO111MODULE=on + +WORKDIR /go/src/sigs.k8s.io/kustomize/api/internal/crawl +COPY . /go/src/sigs.k8s.io/kustomize//api/internal/crawl + +RUN go mod download +RUN CGO_ENABLED=0 go install -v ./cmd/crawler/crawler.go + +FROM scratch +COPY --from=build /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ +COPY --from=build /go/bin/crawler / +ENTRYPOINT ["/crawler"] +CMD [] diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go new file mode 100644 index 000000000..4212d9f19 --- /dev/null +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -0,0 +1,99 @@ +package main + +import ( + "context" + "fmt" + "net/http" + "os" + "time" + + "sigs.k8s.io/kustomize/api/internal/crawl/crawler" + "sigs.k8s.io/kustomize/api/internal/crawl/crawler/github" + "sigs.k8s.io/kustomize/api/internal/crawl/doc" + "sigs.k8s.io/kustomize/api/internal/crawl/httpclient" + "sigs.k8s.io/kustomize/api/internal/crawl/index" + + "github.com/gomodule/redigo/redis" +) + +const ( + githubAccessTokenVar = "GITHUB_ACCESS_TOKEN" + redisCacheURL = "REDIS_CACHE_URL" + redisKeyURL = "REDIS_KEY_URL" + retryCount = 3 +) + +func main() { + githubToken := os.Getenv(githubAccessTokenVar) + if githubToken == "" { + fmt.Printf("Must set the variable '%s' to make github requests.\n", + githubAccessTokenVar) + return + } + + ctx := context.Background() + idx, err := index.NewKustomizeIndex(ctx) + if err != nil { + fmt.Printf("Could not create an index: %v\n", err) + return + } + + cacheURL := os.Getenv(redisCacheURL) + keystoreURL := os.Getenv(redisKeyURL) + + query := []byte(`{ "query":{ "match_all":{} } }`) + it := idx.IterateQuery(query, 10000, 60*time.Second) + docs := make(crawler.CrawlSeed, 0) + for it.Next() { + for _, hit := range it.Value().Hits.Hits { + docs = append(docs, hit.Document.GetDocument()) + } + } + if err := it.Err(); err != nil { + fmt.Printf("Error iterating: %v\n", err) + } + + cache, err := redis.DialURL(cacheURL) + clientCache := &http.Client{} + if err != nil { + fmt.Printf("Error: redis could not make a connection: %v\n", err) + } else { + clientCache = httpclient.NewClient(cache) + } + + _, err = redis.DialURL(keystoreURL) + if err != nil { + fmt.Printf("Error: redis could not make a connection: %v\n", err) + os.Exit(1) + } + + ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml")), + ) + + crawler.CrawlFromSeed(ctx, docs, []crawler.Crawler{ghCrawler}, + // Converter takes in a plain document and processes it for the + // index. + func(d *doc.Document) (crawler.CrawledDocument, error) { + kdoc := doc.KustomizationDocument{ + Document: *d, + } + + err := kdoc.ParseYAML() + return &kdoc, err + }, + // IndexFunc updates the value in the index. + func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error { + switch d := cdoc.(type) { + case *doc.KustomizationDocument: + fmt.Println("Inserting: ", d) + _, err := idx.Put("", d) + return err + default: + return fmt.Errorf("Type %T not supported", d) + } + }, + ) +} diff --git a/api/internal/crawl/config/crawler/base/github_api_secret.txt b/api/internal/crawl/config/crawler/base/github_api_secret.txt new file mode 100644 index 000000000..2bea8de3e --- /dev/null +++ b/api/internal/crawl/config/crawler/base/github_api_secret.txt @@ -0,0 +1,2 @@ + +Run: printf "" > github_api_secret.txt diff --git a/api/internal/crawl/config/crawler/base/kustomization.yaml b/api/internal/crawl/config/crawler/base/kustomization.yaml index f7cee507a..2f2216308 100644 --- a/api/internal/crawl/config/crawler/base/kustomization.yaml +++ b/api/internal/crawl/config/crawler/base/kustomization.yaml @@ -5,7 +5,9 @@ configmapGenerator: - name: crawler-http-cache literals: - redis-cache-url="redis://redis-http-cache:6379" - +- name: redis-keystore + literals: + - keystore-url="redis://redis-docs-keystore:6379" secretGenerator: - name: github-access-token diff --git a/api/internal/crawl/config/webapp/backend/deployment.yaml b/api/internal/crawl/config/webapp/backend/deployment.yaml index d6fab166e..e533cc227 100644 --- a/api/internal/crawl/config/webapp/backend/deployment.yaml +++ b/api/internal/crawl/config/webapp/backend/deployment.yaml @@ -21,10 +21,6 @@ spec: httpGet: path: /liveness port: backend-port - readinessProbe: - httpGet: - path: /readiness - port: backend-port ports: - name: backend-port containerPort: 8080 diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 2e4060994..fa26d1404 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -16,11 +16,11 @@ import ( "strings" "time" - "sigs.k8s.io/kustomize/api/internal/git" - "sigs.k8s.io/kustomize/api/konfig" "sigs.k8s.io/kustomize/api/internal/crawl/crawler" "sigs.k8s.io/kustomize/api/internal/crawl/doc" "sigs.k8s.io/kustomize/api/internal/crawl/httpclient" + "sigs.k8s.io/kustomize/api/internal/git" + "sigs.k8s.io/kustomize/api/konfig" ) var logger = log.New(os.Stdout, "Github Crawler: ", @@ -34,11 +34,11 @@ type githubCrawler struct { type GhClient struct { RequestConfig - retryCount uint64 - client *http.Client + retryCount uint64 + client *http.Client + accessToken string } -/* func NewCrawler(accessToken string, retryCount uint64, client *http.Client, query Query) githubCrawler { @@ -47,14 +47,13 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client, retryCount: retryCount, client: client, RequestConfig: RequestConfig{ - perPage: githubMaxPageSize, - accessToken: accessToken, + perPage: githubMaxPageSize, }, + accessToken: accessToken, }, query: query, } } -*/ // Implements crawler.Crawler. func (gc githubCrawler) Crawl( @@ -64,6 +63,7 @@ func (gc githubCrawler) Crawl( RequestConfig: gc.client.RequestConfig, client: &http.Client{Timeout: gc.client.client.Timeout}, retryCount: gc.client.retryCount, + accessToken: gc.client.accessToken, } // Since Github returns a max of 1000 results per query, we can use @@ -129,7 +129,7 @@ func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) erro continue } } - return fmt.Errorf("file not found: %s", url) + return fmt.Errorf("file not found: %s, error: %v", url, err) } func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error { @@ -534,10 +534,20 @@ func (gcl GhClient) GetRawUserContent(query string) (*http.Response, error) { return gcl.getWithRetry(query) } +func (gcl GhClient) Do(query string) (*http.Response, error) { + req, err := http.NewRequest("GET", query, nil) + if err != nil { + return nil, err + } + req.Header.Add("Authorization", fmt.Sprintf("token %s", gcl.accessToken)) + return gcl.client.Do(req) +} + func (gcl GhClient) getWithRetry( query string) (resp *http.Response, err error) { - resp, err = gcl.client.Get(query) + resp, err = gcl.Do(query) + retryCount := gcl.retryCount for err == nil && @@ -556,7 +566,7 @@ func (gcl GhClient) getWithRetry( logger.Printf("waiting %d seconds before retrying\n", i) time.Sleep(time.Second * time.Duration(i)) retryCount-- - resp, err = gcl.client.Get(query) + resp, err = gcl.Do(query) } if err != nil { diff --git a/api/internal/crawl/crawler/github/queries.go b/api/internal/crawl/crawler/github/queries.go index 3f6ca5d88..4c41b0fa8 100644 --- a/api/internal/crawl/crawler/github/queries.go +++ b/api/internal/crawl/crawler/github/queries.go @@ -11,6 +11,8 @@ const ( accessTokenArg = "access_token" ) +const githubMaxPageSize = 100 + // Implementation detail, not important to external API. type queryField struct { name string @@ -96,14 +98,12 @@ func Path(p string) queryField { // - CommitsRequests: asks Github to list commits made one a file. Useful to // determine the date of a file. type RequestConfig struct { - perPage uint64 - accessToken string + perPage uint64 } -func NewRequestConfig(perPage uint64, accessToken string) RequestConfig { +func NewRequestConfig(perPage uint64) RequestConfig { return RequestConfig{ - perPage: perPage, - accessToken: accessToken, + perPage: perPage, } } @@ -139,9 +139,6 @@ func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string { func (rc RequestConfig) makeRequest(path string, query Query) request { vals := url.Values{} - if rc.accessToken != "" { - vals.Set(accessTokenArg, rc.accessToken) - } vals.Set(perPageArg, fmt.Sprint(rc.perPage)) return request{ @@ -183,7 +180,7 @@ func (r request) URL() string { if encoded == "" && query != "" { sep = "?" } - r.url.RawQuery = encoded + sep + query + r.url.RawQuery = query + sep + encoded return r.url.String() } diff --git a/api/internal/crawl/crawler/github/queries_test.go b/api/internal/crawl/crawler/github/queries_test.go index 98ef7d564..db8fe571c 100644 --- a/api/internal/crawl/crawler/github/queries_test.go +++ b/api/internal/crawl/crawler/github/queries_test.go @@ -84,7 +84,6 @@ func TestGithubSearchQuery(t *testing.T) { { rc: RequestConfig{ perPage: perPage, - accessToken: accessToken, }, codeQuery: Query{ Filename("kustomization.yaml"), @@ -94,13 +93,13 @@ func TestGithubSearchQuery(t *testing.T) { path: "examples/helloWorld/kustomization.yaml", expectedCodeQuery: "https://api.github.com/search/code?" + - "access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128", + "q=filename:kustomization.yaml+size:64..128&order=desc&per_page=100&sort=indexed", expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" + - "examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100", + "examples/helloWorld/kustomization.yaml?per_page=100", expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" + - "access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml", + "q=path:examples/helloWorld/kustomization.yaml&per_page=100", }, }