diff --git a/api/internal/crawl/cmd/crawler/Dockerfile b/api/internal/crawl/cmd/crawler/Dockerfile new file mode 100644 index 000000000..4a8ab957b --- /dev/null +++ b/api/internal/crawl/cmd/crawler/Dockerfile @@ -0,0 +1,15 @@ +FROM golang:1.11 AS build + +ARG GO111MODULE=on + +WORKDIR /go/src/sigs.k8s.io/kustomize/api/internal/crawl +COPY . /go/src/sigs.k8s.io/kustomize//api/internal/crawl + +RUN go mod download +RUN CGO_ENABLED=0 go install -v ./cmd/crawler/crawler.go + +FROM scratch +COPY --from=build /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ +COPY --from=build /go/bin/crawler / +ENTRYPOINT ["/crawler"] +CMD [] diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go new file mode 100644 index 000000000..4212d9f19 --- /dev/null +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -0,0 +1,99 @@ +package main + +import ( + "context" + "fmt" + "net/http" + "os" + "time" + + "sigs.k8s.io/kustomize/api/internal/crawl/crawler" + "sigs.k8s.io/kustomize/api/internal/crawl/crawler/github" + "sigs.k8s.io/kustomize/api/internal/crawl/doc" + "sigs.k8s.io/kustomize/api/internal/crawl/httpclient" + "sigs.k8s.io/kustomize/api/internal/crawl/index" + + "github.com/gomodule/redigo/redis" +) + +const ( + githubAccessTokenVar = "GITHUB_ACCESS_TOKEN" + redisCacheURL = "REDIS_CACHE_URL" + redisKeyURL = "REDIS_KEY_URL" + retryCount = 3 +) + +func main() { + githubToken := os.Getenv(githubAccessTokenVar) + if githubToken == "" { + fmt.Printf("Must set the variable '%s' to make github requests.\n", + githubAccessTokenVar) + return + } + + ctx := context.Background() + idx, err := index.NewKustomizeIndex(ctx) + if err != nil { + fmt.Printf("Could not create an index: %v\n", err) + return + } + + cacheURL := os.Getenv(redisCacheURL) + keystoreURL := os.Getenv(redisKeyURL) + + query := []byte(`{ "query":{ "match_all":{} } }`) + it := idx.IterateQuery(query, 10000, 60*time.Second) + docs := make(crawler.CrawlSeed, 0) + for it.Next() { + for 
_, hit := range it.Value().Hits.Hits { + docs = append(docs, hit.Document.GetDocument()) + } + } + if err := it.Err(); err != nil { + fmt.Printf("Error iterating: %v\n", err) + } + + cache, err := redis.DialURL(cacheURL) + clientCache := &http.Client{} + if err != nil { + fmt.Printf("Error: redis could not make a connection: %v\n", err) + } else { + clientCache = httpclient.NewClient(cache) + } + + _, err = redis.DialURL(keystoreURL) + if err != nil { + fmt.Printf("Error: redis could not make a connection: %v\n", err) + os.Exit(1) + } + + ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml")), + ) + + crawler.CrawlFromSeed(ctx, docs, []crawler.Crawler{ghCrawler}, + // Converter takes in a plain document and processes it for the + // index. + func(d *doc.Document) (crawler.CrawledDocument, error) { + kdoc := doc.KustomizationDocument{ + Document: *d, + } + + err := kdoc.ParseYAML() + return &kdoc, err + }, + // IndexFunc updates the value in the index. 
+ func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error { + switch d := cdoc.(type) { + case *doc.KustomizationDocument: + fmt.Println("Inserting: ", d) + _, err := idx.Put("", d) + return err + default: + return fmt.Errorf("Type %T not supported", d) + } + }, + ) +} diff --git a/api/internal/crawl/config/crawler/base/github_api_secret.txt b/api/internal/crawl/config/crawler/base/github_api_secret.txt new file mode 100644 index 000000000..2bea8de3e --- /dev/null +++ b/api/internal/crawl/config/crawler/base/github_api_secret.txt @@ -0,0 +1,2 @@ + +Run: printf "" > github_api_secret.txt diff --git a/api/internal/crawl/config/crawler/base/kustomization.yaml b/api/internal/crawl/config/crawler/base/kustomization.yaml index f7cee507a..2f2216308 100644 --- a/api/internal/crawl/config/crawler/base/kustomization.yaml +++ b/api/internal/crawl/config/crawler/base/kustomization.yaml @@ -5,7 +5,9 @@ configmapGenerator: - name: crawler-http-cache literals: - redis-cache-url="redis://redis-http-cache:6379" - +- name: redis-keystore + literals: + - keystore-url="redis://redis-docs-keystore:6379" secretGenerator: - name: github-access-token diff --git a/api/internal/crawl/config/webapp/backend/deployment.yaml b/api/internal/crawl/config/webapp/backend/deployment.yaml index d6fab166e..e533cc227 100644 --- a/api/internal/crawl/config/webapp/backend/deployment.yaml +++ b/api/internal/crawl/config/webapp/backend/deployment.yaml @@ -21,10 +21,6 @@ spec: httpGet: path: /liveness port: backend-port - readinessProbe: - httpGet: - path: /readiness - port: backend-port ports: - name: backend-port containerPort: 8080 diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 2e4060994..fa26d1404 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -16,11 +16,11 @@ import ( "strings" "time" - "sigs.k8s.io/kustomize/api/internal/git" - "sigs.k8s.io/kustomize/api/konfig" 
"sigs.k8s.io/kustomize/api/internal/crawl/crawler" "sigs.k8s.io/kustomize/api/internal/crawl/doc" "sigs.k8s.io/kustomize/api/internal/crawl/httpclient" + "sigs.k8s.io/kustomize/api/internal/git" + "sigs.k8s.io/kustomize/api/konfig" ) var logger = log.New(os.Stdout, "Github Crawler: ", @@ -34,11 +34,11 @@ type githubCrawler struct { type GhClient struct { RequestConfig - retryCount uint64 - client *http.Client + retryCount uint64 + client *http.Client + accessToken string } -/* func NewCrawler(accessToken string, retryCount uint64, client *http.Client, query Query) githubCrawler { @@ -47,14 +47,13 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client, retryCount: retryCount, client: client, RequestConfig: RequestConfig{ - perPage: githubMaxPageSize, - accessToken: accessToken, + perPage: githubMaxPageSize, }, + accessToken: accessToken, }, query: query, } } -*/ // Implements crawler.Crawler. func (gc githubCrawler) Crawl( @@ -64,6 +63,7 @@ func (gc githubCrawler) Crawl( RequestConfig: gc.client.RequestConfig, client: &http.Client{Timeout: gc.client.client.Timeout}, retryCount: gc.client.retryCount, + accessToken: gc.client.accessToken, } // Since Github returns a max of 1000 results per query, we can use @@ -129,7 +129,7 @@ func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) erro continue } } - return fmt.Errorf("file not found: %s", url) + return fmt.Errorf("file not found: %s, error: %v", url, err) } func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error { @@ -534,10 +534,20 @@ func (gcl GhClient) GetRawUserContent(query string) (*http.Response, error) { return gcl.getWithRetry(query) } +func (gcl GhClient) Do(query string) (*http.Response, error) { + req, err := http.NewRequest("GET", query, nil) + if err != nil { + return nil, err + } + req.Header.Add("Authorization", fmt.Sprintf("token %s", gcl.accessToken)) + return gcl.client.Do(req) +} + func (gcl GhClient) getWithRetry( query string) 
type RequestConfig struct {
	// perPage is the page size that makeRequest sets on every request
	// built from this configuration.
	perPage uint64
}

// NewRequestConfig returns a RequestConfig whose requests ask for perPage
// results per page.
func NewRequestConfig(perPage uint64) RequestConfig {
	var cfg RequestConfig
	cfg.perPage = perPage
	return cfg
}
} - r.url.RawQuery = encoded + sep + query + r.url.RawQuery = query + sep + encoded return r.url.String() } diff --git a/api/internal/crawl/crawler/github/queries_test.go b/api/internal/crawl/crawler/github/queries_test.go index 98ef7d564..db8fe571c 100644 --- a/api/internal/crawl/crawler/github/queries_test.go +++ b/api/internal/crawl/crawler/github/queries_test.go @@ -84,7 +84,6 @@ func TestGithubSearchQuery(t *testing.T) { { rc: RequestConfig{ perPage: perPage, - accessToken: accessToken, }, codeQuery: Query{ Filename("kustomization.yaml"), @@ -94,13 +93,13 @@ func TestGithubSearchQuery(t *testing.T) { path: "examples/helloWorld/kustomization.yaml", expectedCodeQuery: "https://api.github.com/search/code?" + - "access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128", + "q=filename:kustomization.yaml+size:64..128&order=desc&per_page=100&sort=indexed", expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" + - "examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100", + "examples/helloWorld/kustomization.yaml?per_page=100", expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" + - "access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml", + "q=path:examples/helloWorld/kustomization.yaml&per_page=100", }, }