Make the crawler work

1) add the crawler binary and fix the crawler library
2) remove the readiness probe in the search backend
3) add config for redis keystore
4) add github_api_secret.txt file with instructions
This commit is contained in:
Haiyan Meng
2019-11-26 09:50:51 -08:00
parent 73fb32c85a
commit 84b75afae4
8 changed files with 149 additions and 29 deletions

View File

@@ -0,0 +1,15 @@
# Build stage: compile the crawler binary with Go modules enabled.
FROM golang:1.11 AS build
ARG GO111MODULE=on
WORKDIR /go/src/sigs.k8s.io/kustomize/api/internal/crawl
# Copy the build context into the working directory (same path as WORKDIR).
COPY . /go/src/sigs.k8s.io/kustomize/api/internal/crawl
RUN go mod download
# Disable cgo so the binary does not depend on a libc, which the scratch
# image below does not provide.
RUN CGO_ENABLED=0 go install -v ./cmd/crawler/crawler.go

# Runtime stage: an empty image holding only the CA bundle and the binary.
FROM scratch
# scratch ships no CA certificates; copy them from the build image.
COPY --from=build /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=build /go/bin/crawler /
ENTRYPOINT ["/crawler"]
CMD []

View File

@@ -0,0 +1,99 @@
package main
import (
"context"
"fmt"
"net/http"
"os"
"time"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler/github"
"sigs.k8s.io/kustomize/api/internal/crawl/doc"
"sigs.k8s.io/kustomize/api/internal/crawl/httpclient"
"sigs.k8s.io/kustomize/api/internal/crawl/index"
"github.com/gomodule/redigo/redis"
)
const (
// githubAccessTokenVar is the name of the environment variable that main
// reads the GitHub access token from; main refuses to run when it is empty.
githubAccessTokenVar = "GITHUB_ACCESS_TOKEN"
// redisCacheURL is the name of the environment variable holding the URL
// that main dials to back the HTTP client cache.
redisCacheURL = "REDIS_CACHE_URL"
// redisKeyURL is the name of the environment variable holding the URL of
// the redis keystore that main dials before crawling.
redisKeyURL = "REDIS_KEY_URL"
// retryCount is the retry count passed to github.NewCrawler.
retryCount = 3
)
// main runs one crawl and exits with a non-zero status when a fatal setup
// step fails, so that a supervisor can tell a failed run from a clean one.
func main() {
	// os.Exit skips deferred calls, so all work (and all defers) live in
	// runCrawler, which has fully returned by the time os.Exit is called.
	os.Exit(runCrawler())
}

// runCrawler seeds a crawl with every document currently in the index,
// crawls GitHub for kustomization files, and writes the results back to the
// index. It returns the process exit code: 0 when the crawl was started,
// 1 when the GitHub token is missing, the index cannot be created, or the
// redis keystore cannot be reached.
func runCrawler() int {
	githubToken := os.Getenv(githubAccessTokenVar)
	if githubToken == "" {
		fmt.Printf("Must set the variable '%s' to make github requests.\n",
			githubAccessTokenVar)
		return 1
	}

	ctx := context.Background()
	idx, err := index.NewKustomizeIndex(ctx)
	if err != nil {
		fmt.Printf("Could not create an index: %v\n", err)
		return 1
	}

	cacheURL := os.Getenv(redisCacheURL)
	keystoreURL := os.Getenv(redisKeyURL)

	// Collect every document already in the index to use as the crawl seed.
	query := []byte(`{ "query":{ "match_all":{} } }`)
	it := idx.IterateQuery(query, 10000, 60*time.Second)
	docs := make(crawler.CrawlSeed, 0)
	for it.Next() {
		for _, hit := range it.Value().Hits.Hits {
			docs = append(docs, hit.Document.GetDocument())
		}
	}
	if err := it.Err(); err != nil {
		// Best effort: keep going with whatever seed documents were read.
		fmt.Printf("Error iterating: %v\n", err)
	}

	// The HTTP cache is optional: fall back to a plain client when redis is
	// unreachable.
	clientCache := &http.Client{}
	cache, err := redis.DialURL(cacheURL)
	if err != nil {
		fmt.Printf("Error: redis could not make a connection: %v\n", err)
	} else {
		defer cache.Close()
		clientCache = httpclient.NewClient(cache)
	}

	// The keystore is required; the connection is only used as a
	// reachability check here, so release it when the run finishes.
	keystore, err := redis.DialURL(keystoreURL)
	if err != nil {
		fmt.Printf("Error: redis could not make a connection: %v\n", err)
		return 1
	}
	defer keystore.Close()

	ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache,
		github.QueryWith(
			github.Filename("kustomization.yaml"),
			github.Filename("kustomization.yml")),
	)

	crawler.CrawlFromSeed(ctx, docs, []crawler.Crawler{ghCrawler},
		// Converter takes in a plain document and processes it for the
		// index.
		func(d *doc.Document) (crawler.CrawledDocument, error) {
			kdoc := doc.KustomizationDocument{
				Document: *d,
			}
			err := kdoc.ParseYAML()
			return &kdoc, err
		},
		// IndexFunc updates the value in the index.
		func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
			switch d := cdoc.(type) {
			case *doc.KustomizationDocument:
				fmt.Println("Inserting: ", d)
				_, err := idx.Put("", d)
				return err
			default:
				return fmt.Errorf("Type %T not supported", d)
			}
		},
	)
	return 0
}

View File

@@ -0,0 +1,2 @@
<ADD YOUR GITHUB PERSONAL ACCESS TOKEN HERE WITHOUT A TRAILING NEWLINE>
Run: printf "<your-token>" > github_api_secret.txt

View File

@@ -5,7 +5,9 @@ configmapGenerator:
- name: crawler-http-cache - name: crawler-http-cache
literals: literals:
- redis-cache-url="redis://redis-http-cache:6379" - redis-cache-url="redis://redis-http-cache:6379"
- name: redis-keystore
literals:
- keystore-url="redis://redis-docs-keystore:6379"
secretGenerator: secretGenerator:
- name: github-access-token - name: github-access-token

View File

@@ -21,10 +21,6 @@ spec:
httpGet: httpGet:
path: /liveness path: /liveness
port: backend-port port: backend-port
readinessProbe:
httpGet:
path: /readiness
port: backend-port
ports: ports:
- name: backend-port - name: backend-port
containerPort: 8080 containerPort: 8080

View File

@@ -16,11 +16,11 @@ import (
"strings" "strings"
"time" "time"
"sigs.k8s.io/kustomize/api/internal/git"
"sigs.k8s.io/kustomize/api/konfig"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler" "sigs.k8s.io/kustomize/api/internal/crawl/crawler"
"sigs.k8s.io/kustomize/api/internal/crawl/doc" "sigs.k8s.io/kustomize/api/internal/crawl/doc"
"sigs.k8s.io/kustomize/api/internal/crawl/httpclient" "sigs.k8s.io/kustomize/api/internal/crawl/httpclient"
"sigs.k8s.io/kustomize/api/internal/git"
"sigs.k8s.io/kustomize/api/konfig"
) )
var logger = log.New(os.Stdout, "Github Crawler: ", var logger = log.New(os.Stdout, "Github Crawler: ",
@@ -36,9 +36,9 @@ type GhClient struct {
RequestConfig RequestConfig
retryCount uint64 retryCount uint64
client *http.Client client *http.Client
accessToken string
} }
/*
func NewCrawler(accessToken string, retryCount uint64, client *http.Client, func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
query Query) githubCrawler { query Query) githubCrawler {
@@ -48,13 +48,12 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
client: client, client: client,
RequestConfig: RequestConfig{ RequestConfig: RequestConfig{
perPage: githubMaxPageSize, perPage: githubMaxPageSize,
accessToken: accessToken,
}, },
accessToken: accessToken,
}, },
query: query, query: query,
} }
} }
*/
// Implements crawler.Crawler. // Implements crawler.Crawler.
func (gc githubCrawler) Crawl( func (gc githubCrawler) Crawl(
@@ -64,6 +63,7 @@ func (gc githubCrawler) Crawl(
RequestConfig: gc.client.RequestConfig, RequestConfig: gc.client.RequestConfig,
client: &http.Client{Timeout: gc.client.client.Timeout}, client: &http.Client{Timeout: gc.client.client.Timeout},
retryCount: gc.client.retryCount, retryCount: gc.client.retryCount,
accessToken: gc.client.accessToken,
} }
// Since Github returns a max of 1000 results per query, we can use // Since Github returns a max of 1000 results per query, we can use
@@ -129,7 +129,7 @@ func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) erro
continue continue
} }
} }
return fmt.Errorf("file not found: %s", url) return fmt.Errorf("file not found: %s, error: %v", url, err)
} }
func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error { func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
@@ -534,10 +534,20 @@ func (gcl GhClient) GetRawUserContent(query string) (*http.Response, error) {
return gcl.getWithRetry(query) return gcl.getWithRetry(query)
} }
// Do issues a GET request for query through the client's underlying
// http.Client. When an access token is configured it is sent in the
// Authorization header as "token <accessToken>". It returns the error from
// building the request, or whatever the underlying client's Do returns.
func (gcl GhClient) Do(query string) (*http.Response, error) {
	req, err := http.NewRequest(http.MethodGet, query, nil)
	if err != nil {
		return nil, err
	}
	// Without a token, send the request unauthenticated instead of attaching
	// an empty credential.
	if gcl.accessToken != "" {
		req.Header.Set("Authorization", fmt.Sprintf("token %s", gcl.accessToken))
	}
	return gcl.client.Do(req)
}
func (gcl GhClient) getWithRetry( func (gcl GhClient) getWithRetry(
query string) (resp *http.Response, err error) { query string) (resp *http.Response, err error) {
resp, err = gcl.client.Get(query) resp, err = gcl.Do(query)
retryCount := gcl.retryCount retryCount := gcl.retryCount
for err == nil && for err == nil &&
@@ -556,7 +566,7 @@ func (gcl GhClient) getWithRetry(
logger.Printf("waiting %d seconds before retrying\n", i) logger.Printf("waiting %d seconds before retrying\n", i)
time.Sleep(time.Second * time.Duration(i)) time.Sleep(time.Second * time.Duration(i))
retryCount-- retryCount--
resp, err = gcl.client.Get(query) resp, err = gcl.Do(query)
} }
if err != nil { if err != nil {

View File

@@ -11,6 +11,8 @@ const (
accessTokenArg = "access_token" accessTokenArg = "access_token"
) )
const githubMaxPageSize = 100
// Implementation detail, not important to external API. // Implementation detail, not important to external API.
type queryField struct { type queryField struct {
name string name string
@@ -97,13 +99,11 @@ func Path(p string) queryField {
// determine the date of a file. // determine the date of a file.
type RequestConfig struct { type RequestConfig struct {
perPage uint64 perPage uint64
accessToken string
} }
func NewRequestConfig(perPage uint64, accessToken string) RequestConfig { func NewRequestConfig(perPage uint64) RequestConfig {
return RequestConfig{ return RequestConfig{
perPage: perPage, perPage: perPage,
accessToken: accessToken,
} }
} }
@@ -139,9 +139,6 @@ func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
func (rc RequestConfig) makeRequest(path string, query Query) request { func (rc RequestConfig) makeRequest(path string, query Query) request {
vals := url.Values{} vals := url.Values{}
if rc.accessToken != "" {
vals.Set(accessTokenArg, rc.accessToken)
}
vals.Set(perPageArg, fmt.Sprint(rc.perPage)) vals.Set(perPageArg, fmt.Sprint(rc.perPage))
return request{ return request{
@@ -183,7 +180,7 @@ func (r request) URL() string {
if encoded == "" && query != "" { if encoded == "" && query != "" {
sep = "?" sep = "?"
} }
r.url.RawQuery = encoded + sep + query r.url.RawQuery = query + sep + encoded
return r.url.String() return r.url.String()
} }

View File

@@ -84,7 +84,6 @@ func TestGithubSearchQuery(t *testing.T) {
{ {
rc: RequestConfig{ rc: RequestConfig{
perPage: perPage, perPage: perPage,
accessToken: accessToken,
}, },
codeQuery: Query{ codeQuery: Query{
Filename("kustomization.yaml"), Filename("kustomization.yaml"),
@@ -94,13 +93,13 @@ func TestGithubSearchQuery(t *testing.T) {
path: "examples/helloWorld/kustomization.yaml", path: "examples/helloWorld/kustomization.yaml",
expectedCodeQuery: "https://api.github.com/search/code?" + expectedCodeQuery: "https://api.github.com/search/code?" +
"access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128", "q=filename:kustomization.yaml+size:64..128&order=desc&per_page=100&sort=indexed",
expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" + expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" +
"examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100", "examples/helloWorld/kustomization.yaml?per_page=100",
expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" + expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" +
"access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml", "q=path:examples/helloWorld/kustomization.yaml&per_page=100",
}, },
} }