mirror of
https://github.com/kubernetes-sigs/kustomize.git
synced 2026-06-11 17:12:51 +00:00
Mulitple improvements of the crawler
1) Set document IDs to avoid duplicating documents; 2) Set the `creationTime` field of each document in the index; 3) set the `values`, `kinds` and `identifiers` fields for all documents; 4) Add a `Copy` method into the `Document` struct: this fixes the issue where all the documents existing in the index point to the same Document object; 5) Avoid using keystore redis; 6) Set imagePullPolicy to `Always` for crawler jobs.
This commit is contained in:
@@ -39,16 +39,16 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
cacheURL := os.Getenv(redisCacheURL)
|
cacheURL := os.Getenv(redisCacheURL)
|
||||||
keystoreURL := os.Getenv(redisKeyURL)
|
|
||||||
|
|
||||||
query := []byte(`{ "query":{ "match_all":{} } }`)
|
query := []byte(`{ "query":{ "match_all":{} } }`)
|
||||||
it := idx.IterateQuery(query, 10000, 60*time.Second)
|
it := idx.IterateQuery(query, 10000, 60*time.Second)
|
||||||
docs := make(crawler.CrawlSeed, 0)
|
docs := make(crawler.CrawlSeed, 0)
|
||||||
for it.Next() {
|
for it.Next() {
|
||||||
for _, hit := range it.Value().Hits.Hits {
|
for _, hit := range it.Value().Hits.Hits {
|
||||||
docs = append(docs, hit.Document.GetDocument())
|
docs = append(docs, hit.Document.Copy())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := it.Err(); err != nil {
|
if err := it.Err(); err != nil {
|
||||||
fmt.Printf("Error iterating: %v\n", err)
|
fmt.Printf("Error iterating: %v\n", err)
|
||||||
}
|
}
|
||||||
@@ -61,12 +61,6 @@ func main() {
|
|||||||
clientCache = httpclient.NewClient(cache)
|
clientCache = httpclient.NewClient(cache)
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err = redis.DialURL(keystoreURL)
|
|
||||||
if err != nil {
|
|
||||||
fmt.Printf("Error: redis could not make a connection: %v\n", err)
|
|
||||||
os.Exit(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache,
|
ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache,
|
||||||
github.QueryWith(
|
github.QueryWith(
|
||||||
github.Filename("kustomization.yaml"),
|
github.Filename("kustomization.yaml"),
|
||||||
@@ -88,8 +82,8 @@ func main() {
|
|||||||
func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
|
func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
|
||||||
switch d := cdoc.(type) {
|
switch d := cdoc.(type) {
|
||||||
case *doc.KustomizationDocument:
|
case *doc.KustomizationDocument:
|
||||||
fmt.Println("Inserting: ", d)
|
fmt.Println("Inserting: ", d.ID(), d)
|
||||||
_, err := idx.Put("", d)
|
_, err := idx.Put(d.ID(), d)
|
||||||
return err
|
return err
|
||||||
default:
|
default:
|
||||||
return fmt.Errorf("type %T not supported", d)
|
return fmt.Errorf("type %T not supported", d)
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ spec:
|
|||||||
containers:
|
containers:
|
||||||
- name: crawler
|
- name: crawler
|
||||||
image: gcr.io/kustomize-search/crawler:latest
|
image: gcr.io/kustomize-search/crawler:latest
|
||||||
|
imagePullPolicy: Always
|
||||||
env:
|
env:
|
||||||
- name: GITHUB_ACCESS_TOKEN
|
- name: GITHUB_ACCESS_TOKEN
|
||||||
valueFrom:
|
valueFrom:
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ spec:
|
|||||||
containers:
|
containers:
|
||||||
- name: crawler
|
- name: crawler
|
||||||
image: gcr.io/kustomize-search/crawler:latest
|
image: gcr.io/kustomize-search/crawler:latest
|
||||||
|
imagePullPolicy: Always
|
||||||
env:
|
env:
|
||||||
- name: GITHUB_ACCESS_TOKEN
|
- name: GITHUB_ACCESS_TOKEN
|
||||||
valueFrom:
|
valueFrom:
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ spec:
|
|||||||
containers:
|
containers:
|
||||||
- name: kustomize-search
|
- name: kustomize-search
|
||||||
image: gcr.io/kustomize-search/backend:latest
|
image: gcr.io/kustomize-search/backend:latest
|
||||||
|
imagePullPolicy: Always
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /liveness
|
path: /liveness
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ spec:
|
|||||||
containers:
|
containers:
|
||||||
- name: frontend
|
- name: frontend
|
||||||
image: gcr.io/kustomize-search/frontend:latest
|
image: gcr.io/kustomize-search/frontend:latest
|
||||||
|
imagePullPolicy: Always
|
||||||
ports:
|
ports:
|
||||||
- name: frontend-port
|
- name: frontend-port
|
||||||
containerPort: 80
|
containerPort: 80
|
||||||
|
|||||||
@@ -102,11 +102,9 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed,
|
|||||||
}
|
}
|
||||||
|
|
||||||
doCrawl := func(docsPtr *CrawlSeed) {
|
doCrawl := func(docsPtr *CrawlSeed) {
|
||||||
for len(*docsPtr) > 0 {
|
n := len(*docsPtr)
|
||||||
back := len(*docsPtr) - 1
|
for i := 0; i < n; i++ {
|
||||||
next := (*docsPtr)[back]
|
next := (*docsPtr)[i]
|
||||||
*docsPtr = (*docsPtr)[:back]
|
|
||||||
|
|
||||||
match := findMatch(next)
|
match := findMatch(next)
|
||||||
if match == nil {
|
if match == nil {
|
||||||
logIfErr(fmt.Errorf(
|
logIfErr(fmt.Errorf(
|
||||||
@@ -114,24 +112,28 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed,
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
logger.Println("Crawling ", next.RepositoryURL, next.FilePath)
|
||||||
err := match.FetchDocument(ctx, next)
|
err := match.FetchDocument(ctx, next)
|
||||||
logIfErr(err)
|
logIfErr(err)
|
||||||
// If there was no change or there is an error, we don't have
|
// If there was no change or there is an error, we don't have
|
||||||
// to branch out, since the dependencies are already in the
|
// to branch out, since the dependencies are already in the
|
||||||
// index, or we cannot find the document.
|
// index, or we cannot find the document.
|
||||||
if err != nil || next.WasCached() {
|
if err != nil || next.WasCached() {
|
||||||
|
if next.WasCached() {
|
||||||
|
logger.Println(next.RepositoryURL, next.FilePath, "is cached already")
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
logIfErr(match.SetCreated(ctx, next))
|
||||||
|
|
||||||
cdoc, err := conv(next)
|
cdoc, err := conv(next)
|
||||||
logIfErr(err)
|
logIfErr(err)
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
addBranches(cdoc, match)
|
addBranches(cdoc, match)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Exploit seed to update bulk of corpus.
|
// Exploit seed to update bulk of corpus.
|
||||||
logger.Printf("updating %d documents from seed\n", len(seed))
|
logger.Printf("updating %d documents from seed\n", len(seed))
|
||||||
doCrawl(&seed)
|
doCrawl(&seed)
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ type testCrawler struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c testCrawler) Match(d *doc.Document) bool {
|
func (c testCrawler) Match(d *doc.Document) bool {
|
||||||
return d != nil && strings.HasPrefix(d.ID(), c.matchPrefix)
|
return d != nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c testCrawler) FetchDocument(_ context.Context, d *doc.Document) error {
|
func (c testCrawler) FetchDocument(_ context.Context, d *doc.Document) error {
|
||||||
|
|||||||
@@ -133,8 +133,12 @@ func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (gc githubCrawler) SetCreated(_ context.Context, d *doc.Document) error {
|
func (gc githubCrawler) SetCreated(_ context.Context, d *doc.Document) error {
|
||||||
fs := GhFileSpec{}
|
fs := GhFileSpec{
|
||||||
fs.Repository.FullName = d.RepositoryURL + "/" + d.FilePath
|
Path: d.FilePath,
|
||||||
|
Repository: GitRepository{
|
||||||
|
FullName: d.RepositoryFullName(),
|
||||||
|
},
|
||||||
|
}
|
||||||
creationTime, err := gc.client.GetFileCreationTime(fs)
|
creationTime, err := gc.client.GetFileCreationTime(fs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -185,9 +189,9 @@ func processQuery(ctx context.Context, gcl GhClient, query string,
|
|||||||
for _, file := range page.Parsed.Items {
|
for _, file := range page.Parsed.Items {
|
||||||
k, err := kustomizationResultAdapter(gcl, file)
|
k, err := kustomizationResultAdapter(gcl, file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
logger.Printf("kustomizationResultAdapter failed: %v", err)
|
||||||
errs = append(errs, err)
|
errs = append(errs, err)
|
||||||
errorCnt++
|
errorCnt++
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
output <- k
|
output <- k
|
||||||
totalCnt++
|
totalCnt++
|
||||||
@@ -224,6 +228,18 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec) (
|
|||||||
RepositoryURL: k.Repository.URL,
|
RepositoryURL: k.Repository.URL,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
logger.Printf("Set the creationTime field")
|
||||||
|
creationTime, err := gcl.GetFileCreationTime(k)
|
||||||
|
if err != nil {
|
||||||
|
logger.Printf("GetFileCreationTime failed: %v", err)
|
||||||
|
return &d, err
|
||||||
|
}
|
||||||
|
d.CreationTime = &creationTime
|
||||||
|
|
||||||
|
if err := d.ParseYAML(); err != nil {
|
||||||
|
logger.Printf("ParseYAML failed: %v", err)
|
||||||
|
return &d, err
|
||||||
|
}
|
||||||
|
|
||||||
return &d, nil
|
return &d, nil
|
||||||
}
|
}
|
||||||
@@ -410,13 +426,15 @@ func (e multiError) Error() string {
|
|||||||
return strings.Join(strs, "\n")
|
return strings.Join(strs, "\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type GitRepository struct {
|
||||||
|
API string `json:"url,omitempty"`
|
||||||
|
URL string `json:"html_url,omitempty"`
|
||||||
|
FullName string `json:"full_name,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type GhFileSpec struct {
|
type GhFileSpec struct {
|
||||||
Path string `json:"path,omitempty"`
|
Path string `json:"path,omitempty"`
|
||||||
Repository struct {
|
Repository GitRepository `json:"repository,omitempty"`
|
||||||
API string `json:"url,omitempty"`
|
|
||||||
URL string `json:"html_url,omitempty"`
|
|
||||||
FullName string `json:"full_name,omitempty"`
|
|
||||||
} `json:"repository,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type githubResponse struct {
|
type githubResponse struct {
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
perPageArg = "per_page"
|
perPageArg = "per_page"
|
||||||
)
|
)
|
||||||
|
|
||||||
const githubMaxPageSize = 100
|
const githubMaxPageSize = 100
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ func TestQueryType(t *testing.T) {
|
|||||||
|
|
||||||
func TestGithubSearchQuery(t *testing.T) {
|
func TestGithubSearchQuery(t *testing.T) {
|
||||||
const (
|
const (
|
||||||
perPage = 100
|
perPage = 100
|
||||||
)
|
)
|
||||||
|
|
||||||
testCases := []struct {
|
testCases := []struct {
|
||||||
@@ -82,7 +82,7 @@ func TestGithubSearchQuery(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
rc: RequestConfig{
|
rc: RequestConfig{
|
||||||
perPage: perPage,
|
perPage: perPage,
|
||||||
},
|
},
|
||||||
codeQuery: Query{
|
codeQuery: Query{
|
||||||
Filename("kustomization.yaml"),
|
Filename("kustomization.yaml"),
|
||||||
|
|||||||
@@ -44,9 +44,9 @@ type KustomizationDocument struct {
|
|||||||
type set map[string]struct{}
|
type set map[string]struct{}
|
||||||
|
|
||||||
func (doc *KustomizationDocument) String() string {
|
func (doc *KustomizationDocument) String() string {
|
||||||
return fmt.Sprintf("%s %s %s %v %v %v %v %v", doc.RepositoryURL, doc.FilePath,
|
return fmt.Sprintf("%s %s %s %v %v %v len(identifiers):%v len(values):%v",
|
||||||
doc.DefaultBranch, doc.CreationTime, doc.IsSame,
|
doc.RepositoryURL, doc.FilePath, doc.DefaultBranch, doc.CreationTime,
|
||||||
doc.Kinds, doc.Identifiers, doc.Values)
|
doc.IsSame, doc.Kinds, len(doc.Identifiers), len(doc.Values))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Implements the CrawlerDocument interface.
|
// Implements the CrawlerDocument interface.
|
||||||
|
|||||||
@@ -1,7 +1,10 @@
|
|||||||
package doc
|
package doc
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"crypto/sha256"
|
||||||
|
"fmt"
|
||||||
"path"
|
"path"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"sigs.k8s.io/kustomize/api/internal/git"
|
"sigs.k8s.io/kustomize/api/internal/git"
|
||||||
@@ -21,6 +24,17 @@ func (doc *Document) GetDocument() *Document {
|
|||||||
return doc
|
return doc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (doc *Document) Copy() *Document {
|
||||||
|
return &Document{
|
||||||
|
RepositoryURL: doc.RepositoryURL,
|
||||||
|
FilePath: doc.FilePath,
|
||||||
|
DefaultBranch: doc.DefaultBranch,
|
||||||
|
DocumentData: doc.DocumentData,
|
||||||
|
CreationTime: doc.CreationTime,
|
||||||
|
IsSame: doc.IsSame,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Implements the CrawlerDocument interface.
|
// Implements the CrawlerDocument interface.
|
||||||
func (doc *Document) WasCached() bool {
|
func (doc *Document) WasCached() bool {
|
||||||
return doc.IsSame
|
return doc.IsSame
|
||||||
@@ -53,6 +67,22 @@ func (doc *Document) FromRelativePath(newFile string) (Document, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (doc *Document) ID() string {
|
func (doc *Document) ID() string {
|
||||||
return doc.RepositoryURL + "/" +
|
sum := sha256.Sum256([]byte(strings.Join(
|
||||||
doc.DefaultBranch + "/" + doc.FilePath
|
[]string{
|
||||||
|
doc.RepositoryURL,
|
||||||
|
doc.DefaultBranch,
|
||||||
|
doc.FilePath,
|
||||||
|
},
|
||||||
|
"---|---")))
|
||||||
|
return fmt.Sprintf("%x", sum)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (doc *Document) RepositoryFullName() string {
|
||||||
|
doc.RepositoryURL = strings.TrimRight(doc.RepositoryURL, "/")
|
||||||
|
sections := strings.Split(doc.RepositoryURL, "/")
|
||||||
|
l := len(sections)
|
||||||
|
if l < 2 {
|
||||||
|
return doc.RepositoryURL
|
||||||
|
}
|
||||||
|
return path.Join(sections[l-2], sections[l-1])
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -62,3 +62,44 @@ func TestFromRelativePath(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestDocument_RepositoryFullName(t *testing.T) {
|
||||||
|
testCases := []struct {
|
||||||
|
doc Document
|
||||||
|
expectedRepositoryFullName string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
doc: Document{
|
||||||
|
RepositoryURL: "https://github.com/user/repo",
|
||||||
|
},
|
||||||
|
expectedRepositoryFullName: "user/repo",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
doc: Document{
|
||||||
|
RepositoryURL: "https://github.com//user/repo////",
|
||||||
|
},
|
||||||
|
expectedRepositoryFullName: "user/repo",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
doc: Document{
|
||||||
|
RepositoryURL: "repo/",
|
||||||
|
},
|
||||||
|
expectedRepositoryFullName: "repo",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
doc: Document{
|
||||||
|
RepositoryURL: "",
|
||||||
|
},
|
||||||
|
expectedRepositoryFullName: "",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range testCases {
|
||||||
|
returnedRepositoryFullName := tc.doc.RepositoryFullName()
|
||||||
|
if returnedRepositoryFullName != tc.expectedRepositoryFullName {
|
||||||
|
t.Errorf("RepositoryFullName expected %s, got %s",
|
||||||
|
tc.expectedRepositoryFullName,
|
||||||
|
returnedRepositoryFullName)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user