Make the crawler work

1) add the crawler binary and fix the crawler library
2) remove the readiness probe in the search backend
3) add config for redis keystore
4) add github_api_secret.txt file with instructions
This commit is contained in:
Haiyan Meng
2019-11-26 09:50:51 -08:00
parent 73fb32c85a
commit 84b75afae4
8 changed files with 149 additions and 29 deletions

View File

@@ -0,0 +1,15 @@
# Build stage: compile the crawler binary with Go modules enabled.
FROM golang:1.11 AS build
ARG GO111MODULE=on
WORKDIR /go/src/sigs.k8s.io/kustomize/api/internal/crawl
# Copy the build context into the same directory that WORKDIR points at.
COPY . /go/src/sigs.k8s.io/kustomize/api/internal/crawl
RUN go mod download
# CGO is disabled so the resulting binary can run in the scratch image below.
RUN CGO_ENABLED=0 go install -v ./cmd/crawler/crawler.go

# Runtime stage: only the CA bundle and the crawler binary are shipped.
FROM scratch
COPY --from=build /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=build /go/bin/crawler /
ENTRYPOINT ["/crawler"]
CMD []

View File

@@ -0,0 +1,99 @@
package main
import (
"context"
"fmt"
"net/http"
"os"
"time"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler/github"
"sigs.k8s.io/kustomize/api/internal/crawl/doc"
"sigs.k8s.io/kustomize/api/internal/crawl/httpclient"
"sigs.k8s.io/kustomize/api/internal/crawl/index"
"github.com/gomodule/redigo/redis"
)
// Configuration for the crawler binary. The first three constants are the
// names of environment variables that main reads with os.Getenv.
const (
	// githubAccessTokenVar names the variable holding the GitHub access
	// token; main refuses to run when it is empty.
	githubAccessTokenVar = "GITHUB_ACCESS_TOKEN"
	// redisCacheURL names the variable holding the URL that main dials to
	// build the cached HTTP client.
	redisCacheURL = "REDIS_CACHE_URL"
	// redisKeyURL names the variable holding the URL of the redis keystore
	// that main dials before crawling.
	redisKeyURL = "REDIS_KEY_URL"
	// retryCount is the retry count handed to github.NewCrawler.
	retryCount = 3
)
// main runs a single crawl: it collects the documents already stored in the
// kustomize index as the crawl seed, builds a GitHub crawler that searches
// for kustomization.yaml / kustomization.yml files, and hands both to
// crawler.CrawlFromSeed together with a converter and an index function that
// puts each crawled document back into the index.
//
// Configuration comes from the environment:
//   - GITHUB_ACCESS_TOKEN must be set.
//   - REDIS_CACHE_URL is dialed to build the cached HTTP client; when the
//     dial fails a plain http.Client is used instead.
//   - REDIS_KEY_URL is dialed and must be reachable.
//
// Fatal setup errors terminate the process with exit status 1 so that
// whatever launched the crawler can tell the run failed.
func main() {
	githubToken := os.Getenv(githubAccessTokenVar)
	if githubToken == "" {
		fmt.Printf("Must set the variable '%s' to make github requests.\n",
			githubAccessTokenVar)
		// A bare return would exit with status 0 and hide the failure.
		os.Exit(1)
	}

	ctx := context.Background()
	idx, err := index.NewKustomizeIndex(ctx)
	if err != nil {
		fmt.Printf("Could not create an index: %v\n", err)
		os.Exit(1)
	}

	cacheURL := os.Getenv(redisCacheURL)
	keystoreURL := os.Getenv(redisKeyURL)

	// Seed the crawl with every document the index currently holds.
	query := []byte(`{ "query":{ "match_all":{} } }`)
	it := idx.IterateQuery(query, 10000, 60*time.Second)
	docs := make(crawler.CrawlSeed, 0)
	for it.Next() {
		for _, hit := range it.Value().Hits.Hits {
			docs = append(docs, hit.Document.GetDocument())
		}
	}
	if err := it.Err(); err != nil {
		// Best effort: keep going with whatever seed was collected.
		fmt.Printf("Error iterating: %v\n", err)
	}

	// The HTTP cache is optional; fall back to an uncached client when redis
	// is unreachable.
	// NOTE(review): the fallback client has no timeout configured - confirm
	// whether one should be set.
	clientCache := &http.Client{}
	cache, err := redis.DialURL(cacheURL)
	if err != nil {
		fmt.Printf("Error: redis could not make a connection: %v\n", err)
	} else {
		clientCache = httpclient.NewClient(cache)
	}

	// The keystore is mandatory. Nothing below uses the connection, so it is
	// closed right after the reachability check instead of being leaked.
	keystore, err := redis.DialURL(keystoreURL)
	if err != nil {
		fmt.Printf("Error: redis could not make a connection: %v\n", err)
		os.Exit(1)
	}
	if err := keystore.Close(); err != nil {
		fmt.Printf("Error: could not close the keystore connection: %v\n", err)
	}

	ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache,
		github.QueryWith(
			github.Filename("kustomization.yaml"),
			github.Filename("kustomization.yml")),
	)

	crawler.CrawlFromSeed(ctx, docs, []crawler.Crawler{ghCrawler},
		// Converter takes in a plain document and processes it for the
		// index.
		func(d *doc.Document) (crawler.CrawledDocument, error) {
			kdoc := doc.KustomizationDocument{
				Document: *d,
			}
			err := kdoc.ParseYAML()
			return &kdoc, err
		},
		// IndexFunc updates the value in the index.
		func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
			switch d := cdoc.(type) {
			case *doc.KustomizationDocument:
				fmt.Println("Inserting: ", d)
				_, err := idx.Put("", d)
				return err
			default:
				return fmt.Errorf("Type %T not supported", d)
			}
		},
	)
}

View File

@@ -0,0 +1,2 @@
<ADD YOUR GITHUB PERSONAL ACCESS TOKEN HERE WITHOUT A TRAILING NEWLINE>
Run: printf "<your-token>" > github_api_secret.txt

View File

@@ -5,7 +5,9 @@ configmapGenerator:
- name: crawler-http-cache
literals:
- redis-cache-url="redis://redis-http-cache:6379"
- name: redis-keystore
literals:
- keystore-url="redis://redis-docs-keystore:6379"
secretGenerator:
- name: github-access-token

View File

@@ -21,10 +21,6 @@ spec:
httpGet:
path: /liveness
port: backend-port
readinessProbe:
httpGet:
path: /readiness
port: backend-port
ports:
- name: backend-port
containerPort: 8080

View File

@@ -16,11 +16,11 @@ import (
"strings"
"time"
"sigs.k8s.io/kustomize/api/internal/git"
"sigs.k8s.io/kustomize/api/konfig"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler"
"sigs.k8s.io/kustomize/api/internal/crawl/doc"
"sigs.k8s.io/kustomize/api/internal/crawl/httpclient"
"sigs.k8s.io/kustomize/api/internal/git"
"sigs.k8s.io/kustomize/api/konfig"
)
var logger = log.New(os.Stdout, "Github Crawler: ",
@@ -36,9 +36,9 @@ type GhClient struct {
RequestConfig
retryCount uint64
client *http.Client
accessToken string
}
/*
func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
query Query) githubCrawler {
@@ -48,13 +48,12 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
client: client,
RequestConfig: RequestConfig{
perPage: githubMaxPageSize,
accessToken: accessToken,
},
accessToken: accessToken,
},
query: query,
}
}
*/
// Implements crawler.Crawler.
func (gc githubCrawler) Crawl(
@@ -64,6 +63,7 @@ func (gc githubCrawler) Crawl(
RequestConfig: gc.client.RequestConfig,
client: &http.Client{Timeout: gc.client.client.Timeout},
retryCount: gc.client.retryCount,
accessToken: gc.client.accessToken,
}
// Since Github returns a max of 1000 results per query, we can use
@@ -129,7 +129,7 @@ func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) erro
continue
}
}
return fmt.Errorf("file not found: %s", url)
return fmt.Errorf("file not found: %s, error: %v", url, err)
}
func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
@@ -534,10 +534,20 @@ func (gcl GhClient) GetRawUserContent(query string) (*http.Response, error) {
return gcl.getWithRetry(query)
}
// Do issues a GET request for query through the client's underlying
// http.Client and returns its response.
//
// When an access token is configured it is sent in the Authorization header
// as "token <accessToken>". With an empty token the header is left out
// entirely, so the request goes out unauthenticated instead of carrying a
// credential with no value.
//
// An error is returned when the request cannot be built from query or when
// the underlying client fails.
func (gcl GhClient) Do(query string) (*http.Response, error) {
	req, err := http.NewRequest(http.MethodGet, query, nil)
	if err != nil {
		return nil, err
	}
	if gcl.accessToken != "" {
		req.Header.Set("Authorization", fmt.Sprintf("token %s", gcl.accessToken))
	}
	return gcl.client.Do(req)
}
func (gcl GhClient) getWithRetry(
query string) (resp *http.Response, err error) {
resp, err = gcl.client.Get(query)
resp, err = gcl.Do(query)
retryCount := gcl.retryCount
for err == nil &&
@@ -556,7 +566,7 @@ func (gcl GhClient) getWithRetry(
logger.Printf("waiting %d seconds before retrying\n", i)
time.Sleep(time.Second * time.Duration(i))
retryCount--
resp, err = gcl.client.Get(query)
resp, err = gcl.Do(query)
}
if err != nil {

View File

@@ -11,6 +11,8 @@ const (
accessTokenArg = "access_token"
)
const githubMaxPageSize = 100
// Implementation detail, not important to external API.
type queryField struct {
name string
@@ -97,13 +99,11 @@ func Path(p string) queryField {
// determine the date of a file.
type RequestConfig struct {
perPage uint64
accessToken string
}
func NewRequestConfig(perPage uint64, accessToken string) RequestConfig {
func NewRequestConfig(perPage uint64) RequestConfig {
return RequestConfig{
perPage: perPage,
accessToken: accessToken,
}
}
@@ -139,9 +139,6 @@ func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
func (rc RequestConfig) makeRequest(path string, query Query) request {
vals := url.Values{}
if rc.accessToken != "" {
vals.Set(accessTokenArg, rc.accessToken)
}
vals.Set(perPageArg, fmt.Sprint(rc.perPage))
return request{
@@ -183,7 +180,7 @@ func (r request) URL() string {
if encoded == "" && query != "" {
sep = "?"
}
r.url.RawQuery = encoded + sep + query
r.url.RawQuery = query + sep + encoded
return r.url.String()
}

View File

@@ -84,7 +84,6 @@ func TestGithubSearchQuery(t *testing.T) {
{
rc: RequestConfig{
perPage: perPage,
accessToken: accessToken,
},
codeQuery: Query{
Filename("kustomization.yaml"),
@@ -94,13 +93,13 @@ func TestGithubSearchQuery(t *testing.T) {
path: "examples/helloWorld/kustomization.yaml",
expectedCodeQuery: "https://api.github.com/search/code?" +
"access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128",
"q=filename:kustomization.yaml+size:64..128&order=desc&per_page=100&sort=indexed",
expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" +
"examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100",
"examples/helloWorld/kustomization.yaml?per_page=100",
expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" +
"access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml",
"q=path:examples/helloWorld/kustomization.yaml&per_page=100",
},
}