Merge pull request #1845 from haiyanmeng/work

Make the crawler work
This commit is contained in:
Jeff Regan
2019-11-27 12:07:23 -08:00
committed by GitHub
10 changed files with 198 additions and 29 deletions

View File

@@ -0,0 +1,15 @@
FROM golang:1.11 AS build
ARG GO111MODULE=on
WORKDIR /go/src/sigs.k8s.io/kustomize/api/internal/crawl
COPY . /go/src/sigs.k8s.io/kustomize//api/internal/crawl
RUN go mod download
RUN CGO_ENABLED=0 go install -v ./cmd/crawler/crawler.go
FROM scratch
COPY --from=build /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=build /go/bin/crawler /
ENTRYPOINT ["/crawler"]
CMD []

View File

@@ -0,0 +1,99 @@
package main
import (
"context"
"fmt"
"net/http"
"os"
"time"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler/github"
"sigs.k8s.io/kustomize/api/internal/crawl/doc"
"sigs.k8s.io/kustomize/api/internal/crawl/httpclient"
"sigs.k8s.io/kustomize/api/internal/crawl/index"
"github.com/gomodule/redigo/redis"
)
const (
githubAccessTokenVar = "GITHUB_ACCESS_TOKEN"
redisCacheURL = "REDIS_CACHE_URL"
redisKeyURL = "REDIS_KEY_URL"
retryCount = 3
)
func main() {
githubToken := os.Getenv(githubAccessTokenVar)
if githubToken == "" {
fmt.Printf("Must set the variable '%s' to make github requests.\n",
githubAccessTokenVar)
return
}
ctx := context.Background()
idx, err := index.NewKustomizeIndex(ctx)
if err != nil {
fmt.Printf("Could not create an index: %v\n", err)
return
}
cacheURL := os.Getenv(redisCacheURL)
keystoreURL := os.Getenv(redisKeyURL)
query := []byte(`{ "query":{ "match_all":{} } }`)
it := idx.IterateQuery(query, 10000, 60*time.Second)
docs := make(crawler.CrawlSeed, 0)
for it.Next() {
for _, hit := range it.Value().Hits.Hits {
docs = append(docs, hit.Document.GetDocument())
}
}
if err := it.Err(); err != nil {
fmt.Printf("Error iterating: %v\n", err)
}
cache, err := redis.DialURL(cacheURL)
clientCache := &http.Client{}
if err != nil {
fmt.Printf("Error: redis could not make a connection: %v\n", err)
} else {
clientCache = httpclient.NewClient(cache)
}
_, err = redis.DialURL(keystoreURL)
if err != nil {
fmt.Printf("Error: redis could not make a connection: %v\n", err)
os.Exit(1)
}
ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith(
github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml")),
)
crawler.CrawlFromSeed(ctx, docs, []crawler.Crawler{ghCrawler},
// Converter takes in a plain document and processes it for the
// index.
func(d *doc.Document) (crawler.CrawledDocument, error) {
kdoc := doc.KustomizationDocument{
Document: *d,
}
err := kdoc.ParseYAML()
return &kdoc, err
},
// IndexFunc updates the value in the index.
func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
switch d := cdoc.(type) {
case *doc.KustomizationDocument:
fmt.Println("Inserting: ", d)
_, err := idx.Put("", d)
return err
default:
return fmt.Errorf("Type %T not supported", d)
}
},
)
}

View File

@@ -0,0 +1,2 @@
<ADD YOUR GITHUB PERSONAL ACCESS TOKEN HERE WITHOUT A TRAILING NEWLINE>
Run: printf "<your-token>" > github_api_secret.txt

View File

@@ -5,7 +5,9 @@ configmapGenerator:
- name: crawler-http-cache
literals:
- redis-cache-url="redis://redis-http-cache:6379"
- name: redis-keystore
literals:
- keystore-url="redis://redis-docs-keystore:6379"
secretGenerator:
- name: github-access-token

View File

@@ -0,0 +1,43 @@
apiVersion: elasticsearch.cloud.google.com/v1alpha1
kind: ESCluster
metadata:
name: esbasic
spec:
plugin:
pluginList:
- repository-gcs
- ingest-user-agent
- ingest-geoip
config:
env:
example: test
nodegroups:
- name: di
replicas: 2
data: true
ingest: true
config:
jvm:
- Djava.net.preferIPv4Stack=true
- Xms2g
- Xmx2g
es:
path.repo: '["/tmp/es_backup_basic"]'
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- topologyKey: kubernetes.io/hostname
labelSelector:
matchLabels:
es/nodegroup: di
resources:
requests:
memory: 3Gi
limits:
memory: 3Gi
- name: m
replicas: 2
master: true
config:
es:
path.repo: '["/tmp/es_backup_basic"]'

View File

@@ -21,10 +21,6 @@ spec:
httpGet:
path: /liveness
port: backend-port
readinessProbe:
httpGet:
path: /readiness
port: backend-port
ports:
- name: backend-port
containerPort: 8080

View File

@@ -16,11 +16,11 @@ import (
"strings"
"time"
"sigs.k8s.io/kustomize/api/internal/git"
"sigs.k8s.io/kustomize/api/konfig"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler"
"sigs.k8s.io/kustomize/api/internal/crawl/doc"
"sigs.k8s.io/kustomize/api/internal/crawl/httpclient"
"sigs.k8s.io/kustomize/api/internal/git"
"sigs.k8s.io/kustomize/api/konfig"
)
var logger = log.New(os.Stdout, "Github Crawler: ",
@@ -34,11 +34,11 @@ type githubCrawler struct {
type GhClient struct {
RequestConfig
retryCount uint64
client *http.Client
retryCount uint64
client *http.Client
accessToken string
}
/*
func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
query Query) githubCrawler {
@@ -47,14 +47,13 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
retryCount: retryCount,
client: client,
RequestConfig: RequestConfig{
perPage: githubMaxPageSize,
accessToken: accessToken,
perPage: githubMaxPageSize,
},
accessToken: accessToken,
},
query: query,
}
}
*/
// Implements crawler.Crawler.
func (gc githubCrawler) Crawl(
@@ -64,6 +63,7 @@ func (gc githubCrawler) Crawl(
RequestConfig: gc.client.RequestConfig,
client: &http.Client{Timeout: gc.client.client.Timeout},
retryCount: gc.client.retryCount,
accessToken: gc.client.accessToken,
}
// Since Github returns a max of 1000 results per query, we can use
@@ -129,7 +129,7 @@ func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) erro
continue
}
}
return fmt.Errorf("file not found: %s", url)
return fmt.Errorf("file not found: %s, error: %v", url, err)
}
func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
@@ -534,10 +534,20 @@ func (gcl GhClient) GetRawUserContent(query string) (*http.Response, error) {
return gcl.getWithRetry(query)
}
func (gcl GhClient) Do(query string) (*http.Response, error) {
req, err := http.NewRequest("GET", query, nil)
if err != nil {
return nil, err
}
req.Header.Add("Authorization", fmt.Sprintf("token %s", gcl.accessToken))
return gcl.client.Do(req)
}
func (gcl GhClient) getWithRetry(
query string) (resp *http.Response, err error) {
resp, err = gcl.client.Get(query)
resp, err = gcl.Do(query)
retryCount := gcl.retryCount
for err == nil &&
@@ -556,7 +566,7 @@ func (gcl GhClient) getWithRetry(
logger.Printf("waiting %d seconds before retrying\n", i)
time.Sleep(time.Second * time.Duration(i))
retryCount--
resp, err = gcl.client.Get(query)
resp, err = gcl.Do(query)
}
if err != nil {

View File

@@ -11,6 +11,8 @@ const (
accessTokenArg = "access_token"
)
const githubMaxPageSize = 100
// Implementation detail, not important to external API.
type queryField struct {
name string
@@ -96,14 +98,12 @@ func Path(p string) queryField {
// - CommitsRequests: asks Github to list commits made one a file. Useful to
// determine the date of a file.
type RequestConfig struct {
perPage uint64
accessToken string
perPage uint64
}
func NewRequestConfig(perPage uint64, accessToken string) RequestConfig {
func NewRequestConfig(perPage uint64) RequestConfig {
return RequestConfig{
perPage: perPage,
accessToken: accessToken,
perPage: perPage,
}
}
@@ -139,9 +139,6 @@ func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
func (rc RequestConfig) makeRequest(path string, query Query) request {
vals := url.Values{}
if rc.accessToken != "" {
vals.Set(accessTokenArg, rc.accessToken)
}
vals.Set(perPageArg, fmt.Sprint(rc.perPage))
return request{
@@ -183,7 +180,7 @@ func (r request) URL() string {
if encoded == "" && query != "" {
sep = "?"
}
r.url.RawQuery = encoded + sep + query
r.url.RawQuery = query + sep + encoded
return r.url.String()
}

View File

@@ -84,7 +84,6 @@ func TestGithubSearchQuery(t *testing.T) {
{
rc: RequestConfig{
perPage: perPage,
accessToken: accessToken,
},
codeQuery: Query{
Filename("kustomization.yaml"),
@@ -94,13 +93,13 @@ func TestGithubSearchQuery(t *testing.T) {
path: "examples/helloWorld/kustomization.yaml",
expectedCodeQuery: "https://api.github.com/search/code?" +
"access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128",
"q=filename:kustomization.yaml+size:64..128&order=desc&per_page=100&sort=indexed",
expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" +
"examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100",
"examples/helloWorld/kustomization.yaml?per_page=100",
expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" +
"access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml",
"q=path:examples/helloWorld/kustomization.yaml&per_page=100",
},
}

View File

@@ -43,6 +43,12 @@ type KustomizationDocument struct {
type set map[string]struct{}
func (doc *KustomizationDocument) String() string {
return fmt.Sprintf("%s %s %s %v %v %v %v %v", doc.RepositoryURL, doc.FilePath,
doc.DefaultBranch, doc.CreationTime, doc.IsSame,
doc.Kinds, doc.Identifiers, doc.Values)
}
// Implements the CrawlerDocument interface.
func (doc *KustomizationDocument) GetResources() ([]*Document, error) {
isResource := true