mirror of
https://github.com/kubernetes-sigs/kustomize.git
synced 2026-06-13 18:10:59 +00:00
15
api/internal/crawl/cmd/crawler/Dockerfile
Normal file
15
api/internal/crawl/cmd/crawler/Dockerfile
Normal file
@@ -0,0 +1,15 @@
|
||||
FROM golang:1.11 AS build
|
||||
|
||||
ARG GO111MODULE=on
|
||||
|
||||
WORKDIR /go/src/sigs.k8s.io/kustomize/api/internal/crawl
|
||||
COPY . /go/src/sigs.k8s.io/kustomize//api/internal/crawl
|
||||
|
||||
RUN go mod download
|
||||
RUN CGO_ENABLED=0 go install -v ./cmd/crawler/crawler.go
|
||||
|
||||
FROM scratch
|
||||
COPY --from=build /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
|
||||
COPY --from=build /go/bin/crawler /
|
||||
ENTRYPOINT ["/crawler"]
|
||||
CMD []
|
||||
99
api/internal/crawl/cmd/crawler/crawler.go
Normal file
99
api/internal/crawl/cmd/crawler/crawler.go
Normal file
@@ -0,0 +1,99 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"sigs.k8s.io/kustomize/api/internal/crawl/crawler"
|
||||
"sigs.k8s.io/kustomize/api/internal/crawl/crawler/github"
|
||||
"sigs.k8s.io/kustomize/api/internal/crawl/doc"
|
||||
"sigs.k8s.io/kustomize/api/internal/crawl/httpclient"
|
||||
"sigs.k8s.io/kustomize/api/internal/crawl/index"
|
||||
|
||||
"github.com/gomodule/redigo/redis"
|
||||
)
|
||||
|
||||
const (
|
||||
githubAccessTokenVar = "GITHUB_ACCESS_TOKEN"
|
||||
redisCacheURL = "REDIS_CACHE_URL"
|
||||
redisKeyURL = "REDIS_KEY_URL"
|
||||
retryCount = 3
|
||||
)
|
||||
|
||||
func main() {
|
||||
githubToken := os.Getenv(githubAccessTokenVar)
|
||||
if githubToken == "" {
|
||||
fmt.Printf("Must set the variable '%s' to make github requests.\n",
|
||||
githubAccessTokenVar)
|
||||
return
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
idx, err := index.NewKustomizeIndex(ctx)
|
||||
if err != nil {
|
||||
fmt.Printf("Could not create an index: %v\n", err)
|
||||
return
|
||||
}
|
||||
|
||||
cacheURL := os.Getenv(redisCacheURL)
|
||||
keystoreURL := os.Getenv(redisKeyURL)
|
||||
|
||||
query := []byte(`{ "query":{ "match_all":{} } }`)
|
||||
it := idx.IterateQuery(query, 10000, 60*time.Second)
|
||||
docs := make(crawler.CrawlSeed, 0)
|
||||
for it.Next() {
|
||||
for _, hit := range it.Value().Hits.Hits {
|
||||
docs = append(docs, hit.Document.GetDocument())
|
||||
}
|
||||
}
|
||||
if err := it.Err(); err != nil {
|
||||
fmt.Printf("Error iterating: %v\n", err)
|
||||
}
|
||||
|
||||
cache, err := redis.DialURL(cacheURL)
|
||||
clientCache := &http.Client{}
|
||||
if err != nil {
|
||||
fmt.Printf("Error: redis could not make a connection: %v\n", err)
|
||||
} else {
|
||||
clientCache = httpclient.NewClient(cache)
|
||||
}
|
||||
|
||||
_, err = redis.DialURL(keystoreURL)
|
||||
if err != nil {
|
||||
fmt.Printf("Error: redis could not make a connection: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache,
|
||||
github.QueryWith(
|
||||
github.Filename("kustomization.yaml"),
|
||||
github.Filename("kustomization.yml")),
|
||||
)
|
||||
|
||||
crawler.CrawlFromSeed(ctx, docs, []crawler.Crawler{ghCrawler},
|
||||
// Converter takes in a plain document and processes it for the
|
||||
// index.
|
||||
func(d *doc.Document) (crawler.CrawledDocument, error) {
|
||||
kdoc := doc.KustomizationDocument{
|
||||
Document: *d,
|
||||
}
|
||||
|
||||
err := kdoc.ParseYAML()
|
||||
return &kdoc, err
|
||||
},
|
||||
// IndexFunc updates the value in the index.
|
||||
func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
|
||||
switch d := cdoc.(type) {
|
||||
case *doc.KustomizationDocument:
|
||||
fmt.Println("Inserting: ", d)
|
||||
_, err := idx.Put("", d)
|
||||
return err
|
||||
default:
|
||||
return fmt.Errorf("Type %T not supported", d)
|
||||
}
|
||||
},
|
||||
)
|
||||
}
|
||||
@@ -0,0 +1,2 @@
|
||||
<ADD YOUR GITHUB PERSONAL ACCESS TOKEN HERE WITHOUT A TRAILING NEWLINE>
|
||||
Run: printf "<your-token>" > github_api_secret.txt
|
||||
@@ -5,7 +5,9 @@ configmapGenerator:
|
||||
- name: crawler-http-cache
|
||||
literals:
|
||||
- redis-cache-url="redis://redis-http-cache:6379"
|
||||
|
||||
- name: redis-keystore
|
||||
literals:
|
||||
- keystore-url="redis://redis-docs-keystore:6379"
|
||||
|
||||
secretGenerator:
|
||||
- name: github-access-token
|
||||
|
||||
43
api/internal/crawl/config/elastic/escluster.yaml
Normal file
43
api/internal/crawl/config/elastic/escluster.yaml
Normal file
@@ -0,0 +1,43 @@
|
||||
apiVersion: elasticsearch.cloud.google.com/v1alpha1
|
||||
kind: ESCluster
|
||||
metadata:
|
||||
name: esbasic
|
||||
spec:
|
||||
plugin:
|
||||
pluginList:
|
||||
- repository-gcs
|
||||
- ingest-user-agent
|
||||
- ingest-geoip
|
||||
config:
|
||||
env:
|
||||
example: test
|
||||
nodegroups:
|
||||
- name: di
|
||||
replicas: 2
|
||||
data: true
|
||||
ingest: true
|
||||
config:
|
||||
jvm:
|
||||
- Djava.net.preferIPv4Stack=true
|
||||
- Xms2g
|
||||
- Xmx2g
|
||||
es:
|
||||
path.repo: '["/tmp/es_backup_basic"]'
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
- topologyKey: kubernetes.io/hostname
|
||||
labelSelector:
|
||||
matchLabels:
|
||||
es/nodegroup: di
|
||||
resources:
|
||||
requests:
|
||||
memory: 3Gi
|
||||
limits:
|
||||
memory: 3Gi
|
||||
- name: m
|
||||
replicas: 2
|
||||
master: true
|
||||
config:
|
||||
es:
|
||||
path.repo: '["/tmp/es_backup_basic"]'
|
||||
@@ -21,10 +21,6 @@ spec:
|
||||
httpGet:
|
||||
path: /liveness
|
||||
port: backend-port
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /readiness
|
||||
port: backend-port
|
||||
ports:
|
||||
- name: backend-port
|
||||
containerPort: 8080
|
||||
|
||||
@@ -16,11 +16,11 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"sigs.k8s.io/kustomize/api/internal/git"
|
||||
"sigs.k8s.io/kustomize/api/konfig"
|
||||
"sigs.k8s.io/kustomize/api/internal/crawl/crawler"
|
||||
"sigs.k8s.io/kustomize/api/internal/crawl/doc"
|
||||
"sigs.k8s.io/kustomize/api/internal/crawl/httpclient"
|
||||
"sigs.k8s.io/kustomize/api/internal/git"
|
||||
"sigs.k8s.io/kustomize/api/konfig"
|
||||
)
|
||||
|
||||
var logger = log.New(os.Stdout, "Github Crawler: ",
|
||||
@@ -34,11 +34,11 @@ type githubCrawler struct {
|
||||
|
||||
type GhClient struct {
|
||||
RequestConfig
|
||||
retryCount uint64
|
||||
client *http.Client
|
||||
retryCount uint64
|
||||
client *http.Client
|
||||
accessToken string
|
||||
}
|
||||
|
||||
/*
|
||||
func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
|
||||
query Query) githubCrawler {
|
||||
|
||||
@@ -47,14 +47,13 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
|
||||
retryCount: retryCount,
|
||||
client: client,
|
||||
RequestConfig: RequestConfig{
|
||||
perPage: githubMaxPageSize,
|
||||
accessToken: accessToken,
|
||||
perPage: githubMaxPageSize,
|
||||
},
|
||||
accessToken: accessToken,
|
||||
},
|
||||
query: query,
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// Implements crawler.Crawler.
|
||||
func (gc githubCrawler) Crawl(
|
||||
@@ -64,6 +63,7 @@ func (gc githubCrawler) Crawl(
|
||||
RequestConfig: gc.client.RequestConfig,
|
||||
client: &http.Client{Timeout: gc.client.client.Timeout},
|
||||
retryCount: gc.client.retryCount,
|
||||
accessToken: gc.client.accessToken,
|
||||
}
|
||||
|
||||
// Since Github returns a max of 1000 results per query, we can use
|
||||
@@ -129,7 +129,7 @@ func (gc githubCrawler) FetchDocument(ctx context.Context, d *doc.Document) erro
|
||||
continue
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("file not found: %s", url)
|
||||
return fmt.Errorf("file not found: %s, error: %v", url, err)
|
||||
}
|
||||
|
||||
func (gc githubCrawler) SetCreated(ctx context.Context, d *doc.Document) error {
|
||||
@@ -534,10 +534,20 @@ func (gcl GhClient) GetRawUserContent(query string) (*http.Response, error) {
|
||||
return gcl.getWithRetry(query)
|
||||
}
|
||||
|
||||
func (gcl GhClient) Do(query string) (*http.Response, error) {
|
||||
req, err := http.NewRequest("GET", query, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Add("Authorization", fmt.Sprintf("token %s", gcl.accessToken))
|
||||
return gcl.client.Do(req)
|
||||
}
|
||||
|
||||
func (gcl GhClient) getWithRetry(
|
||||
query string) (resp *http.Response, err error) {
|
||||
|
||||
resp, err = gcl.client.Get(query)
|
||||
resp, err = gcl.Do(query)
|
||||
|
||||
retryCount := gcl.retryCount
|
||||
|
||||
for err == nil &&
|
||||
@@ -556,7 +566,7 @@ func (gcl GhClient) getWithRetry(
|
||||
logger.Printf("waiting %d seconds before retrying\n", i)
|
||||
time.Sleep(time.Second * time.Duration(i))
|
||||
retryCount--
|
||||
resp, err = gcl.client.Get(query)
|
||||
resp, err = gcl.Do(query)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
|
||||
@@ -11,6 +11,8 @@ const (
|
||||
accessTokenArg = "access_token"
|
||||
)
|
||||
|
||||
const githubMaxPageSize = 100
|
||||
|
||||
// Implementation detail, not important to external API.
|
||||
type queryField struct {
|
||||
name string
|
||||
@@ -96,14 +98,12 @@ func Path(p string) queryField {
|
||||
// - CommitsRequests: asks Github to list commits made one a file. Useful to
|
||||
// determine the date of a file.
|
||||
type RequestConfig struct {
|
||||
perPage uint64
|
||||
accessToken string
|
||||
perPage uint64
|
||||
}
|
||||
|
||||
func NewRequestConfig(perPage uint64, accessToken string) RequestConfig {
|
||||
func NewRequestConfig(perPage uint64) RequestConfig {
|
||||
return RequestConfig{
|
||||
perPage: perPage,
|
||||
accessToken: accessToken,
|
||||
perPage: perPage,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -139,9 +139,6 @@ func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
|
||||
|
||||
func (rc RequestConfig) makeRequest(path string, query Query) request {
|
||||
vals := url.Values{}
|
||||
if rc.accessToken != "" {
|
||||
vals.Set(accessTokenArg, rc.accessToken)
|
||||
}
|
||||
vals.Set(perPageArg, fmt.Sprint(rc.perPage))
|
||||
|
||||
return request{
|
||||
@@ -183,7 +180,7 @@ func (r request) URL() string {
|
||||
if encoded == "" && query != "" {
|
||||
sep = "?"
|
||||
}
|
||||
r.url.RawQuery = encoded + sep + query
|
||||
r.url.RawQuery = query + sep + encoded
|
||||
return r.url.String()
|
||||
}
|
||||
|
||||
|
||||
@@ -84,7 +84,6 @@ func TestGithubSearchQuery(t *testing.T) {
|
||||
{
|
||||
rc: RequestConfig{
|
||||
perPage: perPage,
|
||||
accessToken: accessToken,
|
||||
},
|
||||
codeQuery: Query{
|
||||
Filename("kustomization.yaml"),
|
||||
@@ -94,13 +93,13 @@ func TestGithubSearchQuery(t *testing.T) {
|
||||
path: "examples/helloWorld/kustomization.yaml",
|
||||
|
||||
expectedCodeQuery: "https://api.github.com/search/code?" +
|
||||
"access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128",
|
||||
"q=filename:kustomization.yaml+size:64..128&order=desc&per_page=100&sort=indexed",
|
||||
|
||||
expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" +
|
||||
"examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100",
|
||||
"examples/helloWorld/kustomization.yaml?per_page=100",
|
||||
|
||||
expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" +
|
||||
"access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml",
|
||||
"q=path:examples/helloWorld/kustomization.yaml&per_page=100",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -43,6 +43,12 @@ type KustomizationDocument struct {
|
||||
|
||||
type set map[string]struct{}
|
||||
|
||||
func (doc *KustomizationDocument) String() string {
|
||||
return fmt.Sprintf("%s %s %s %v %v %v %v %v", doc.RepositoryURL, doc.FilePath,
|
||||
doc.DefaultBranch, doc.CreationTime, doc.IsSame,
|
||||
doc.Kinds, doc.Identifiers, doc.Values)
|
||||
}
|
||||
|
||||
// Implements the CrawlerDocument interface.
|
||||
func (doc *KustomizationDocument) GetResources() ([]*Document, error) {
|
||||
isResource := true
|
||||
|
||||
Reference in New Issue
Block a user