Mulitple improvements of the crawler

1) Set document IDs to avoid duplicating documents;
2) Set the `creationTime` field of each document in the index;
3) set the `values`, `kinds` and `identifiers` fields for all documents;
4) Add a `Copy` method into the `Document` struct: this fixes the issue
where all the documents existing in the index point to the same Document
object;
5) Avoid using keystore redis;
6) Set imagePullPolicy to `Always` for crawler jobs.
This commit is contained in:
Haiyan Meng
2019-12-05 09:51:22 -08:00
parent 54b1549586
commit bffc0d7071
13 changed files with 125 additions and 36 deletions

View File

@@ -39,16 +39,16 @@ func main() {
} }
cacheURL := os.Getenv(redisCacheURL) cacheURL := os.Getenv(redisCacheURL)
keystoreURL := os.Getenv(redisKeyURL)
query := []byte(`{ "query":{ "match_all":{} } }`) query := []byte(`{ "query":{ "match_all":{} } }`)
it := idx.IterateQuery(query, 10000, 60*time.Second) it := idx.IterateQuery(query, 10000, 60*time.Second)
docs := make(crawler.CrawlSeed, 0) docs := make(crawler.CrawlSeed, 0)
for it.Next() { for it.Next() {
for _, hit := range it.Value().Hits.Hits { for _, hit := range it.Value().Hits.Hits {
docs = append(docs, hit.Document.GetDocument()) docs = append(docs, hit.Document.Copy())
} }
} }
if err := it.Err(); err != nil { if err := it.Err(); err != nil {
fmt.Printf("Error iterating: %v\n", err) fmt.Printf("Error iterating: %v\n", err)
} }
@@ -61,12 +61,6 @@ func main() {
clientCache = httpclient.NewClient(cache) clientCache = httpclient.NewClient(cache)
} }
_, err = redis.DialURL(keystoreURL)
if err != nil {
fmt.Printf("Error: redis could not make a connection: %v\n", err)
os.Exit(1)
}
ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache, ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith( github.QueryWith(
github.Filename("kustomization.yaml"), github.Filename("kustomization.yaml"),
@@ -88,8 +82,8 @@ func main() {
func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error { func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
switch d := cdoc.(type) { switch d := cdoc.(type) {
case *doc.KustomizationDocument: case *doc.KustomizationDocument:
fmt.Println("Inserting: ", d) fmt.Println("Inserting: ", d.ID(), d)
_, err := idx.Put("", d) _, err := idx.Put(d.ID(), d)
return err return err
default: default:
return fmt.Errorf("type %T not supported", d) return fmt.Errorf("type %T not supported", d)

View File

@@ -12,6 +12,7 @@ spec:
containers: containers:
- name: crawler - name: crawler
image: gcr.io/kustomize-search/crawler:latest image: gcr.io/kustomize-search/crawler:latest
imagePullPolicy: Always
env: env:
- name: GITHUB_ACCESS_TOKEN - name: GITHUB_ACCESS_TOKEN
valueFrom: valueFrom:

View File

@@ -9,6 +9,7 @@ spec:
containers: containers:
- name: crawler - name: crawler
image: gcr.io/kustomize-search/crawler:latest image: gcr.io/kustomize-search/crawler:latest
imagePullPolicy: Always
env: env:
- name: GITHUB_ACCESS_TOKEN - name: GITHUB_ACCESS_TOKEN
valueFrom: valueFrom:

View File

@@ -17,6 +17,7 @@ spec:
containers: containers:
- name: kustomize-search - name: kustomize-search
image: gcr.io/kustomize-search/backend:latest image: gcr.io/kustomize-search/backend:latest
imagePullPolicy: Always
livenessProbe: livenessProbe:
httpGet: httpGet:
path: /liveness path: /liveness

View File

@@ -17,6 +17,7 @@ spec:
containers: containers:
- name: frontend - name: frontend
image: gcr.io/kustomize-search/frontend:latest image: gcr.io/kustomize-search/frontend:latest
imagePullPolicy: Always
ports: ports:
- name: frontend-port - name: frontend-port
containerPort: 80 containerPort: 80

View File

@@ -102,11 +102,9 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed,
} }
doCrawl := func(docsPtr *CrawlSeed) { doCrawl := func(docsPtr *CrawlSeed) {
for len(*docsPtr) > 0 { n := len(*docsPtr)
back := len(*docsPtr) - 1 for i := 0; i < n; i++ {
next := (*docsPtr)[back] next := (*docsPtr)[i]
*docsPtr = (*docsPtr)[:back]
match := findMatch(next) match := findMatch(next)
if match == nil { if match == nil {
logIfErr(fmt.Errorf( logIfErr(fmt.Errorf(
@@ -114,24 +112,28 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed,
continue continue
} }
logger.Println("Crawling ", next.RepositoryURL, next.FilePath)
err := match.FetchDocument(ctx, next) err := match.FetchDocument(ctx, next)
logIfErr(err) logIfErr(err)
// If there was no change or there is an error, we don't have // If there was no change or there is an error, we don't have
// to branch out, since the dependencies are already in the // to branch out, since the dependencies are already in the
// index, or we cannot find the document. // index, or we cannot find the document.
if err != nil || next.WasCached() { if err != nil || next.WasCached() {
if next.WasCached() {
logger.Println(next.RepositoryURL, next.FilePath, "is cached already")
}
continue continue
} }
logIfErr(match.SetCreated(ctx, next))
cdoc, err := conv(next) cdoc, err := conv(next)
logIfErr(err) logIfErr(err)
if err != nil {
continue
}
addBranches(cdoc, match) addBranches(cdoc, match)
} }
} }
// Exploit seed to update bulk of corpus. // Exploit seed to update bulk of corpus.
logger.Printf("updating %d documents from seed\n", len(seed)) logger.Printf("updating %d documents from seed\n", len(seed))
doCrawl(&seed) doCrawl(&seed)

View File

@@ -29,7 +29,7 @@ type testCrawler struct {
} }
func (c testCrawler) Match(d *doc.Document) bool { func (c testCrawler) Match(d *doc.Document) bool {
return d != nil && strings.HasPrefix(d.ID(), c.matchPrefix) return d != nil
} }
func (c testCrawler) FetchDocument(_ context.Context, d *doc.Document) error { func (c testCrawler) FetchDocument(_ context.Context, d *doc.Document) error {

View File

@@ -133,8 +133,12 @@ func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error
} }
func (gc githubCrawler) SetCreated(_ context.Context, d *doc.Document) error { func (gc githubCrawler) SetCreated(_ context.Context, d *doc.Document) error {
fs := GhFileSpec{} fs := GhFileSpec{
fs.Repository.FullName = d.RepositoryURL + "/" + d.FilePath Path: d.FilePath,
Repository: GitRepository{
FullName: d.RepositoryFullName(),
},
}
creationTime, err := gc.client.GetFileCreationTime(fs) creationTime, err := gc.client.GetFileCreationTime(fs)
if err != nil { if err != nil {
return err return err
@@ -185,9 +189,9 @@ func processQuery(ctx context.Context, gcl GhClient, query string,
for _, file := range page.Parsed.Items { for _, file := range page.Parsed.Items {
k, err := kustomizationResultAdapter(gcl, file) k, err := kustomizationResultAdapter(gcl, file)
if err != nil { if err != nil {
logger.Printf("kustomizationResultAdapter failed: %v", err)
errs = append(errs, err) errs = append(errs, err)
errorCnt++ errorCnt++
continue
} }
output <- k output <- k
totalCnt++ totalCnt++
@@ -224,6 +228,18 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec) (
RepositoryURL: k.Repository.URL, RepositoryURL: k.Repository.URL,
}, },
} }
logger.Printf("Set the creationTime field")
creationTime, err := gcl.GetFileCreationTime(k)
if err != nil {
logger.Printf("GetFileCreationTime failed: %v", err)
return &d, err
}
d.CreationTime = &creationTime
if err := d.ParseYAML(); err != nil {
logger.Printf("ParseYAML failed: %v", err)
return &d, err
}
return &d, nil return &d, nil
} }
@@ -410,13 +426,15 @@ func (e multiError) Error() string {
return strings.Join(strs, "\n") return strings.Join(strs, "\n")
} }
type GitRepository struct {
API string `json:"url,omitempty"`
URL string `json:"html_url,omitempty"`
FullName string `json:"full_name,omitempty"`
}
type GhFileSpec struct { type GhFileSpec struct {
Path string `json:"path,omitempty"` Path string `json:"path,omitempty"`
Repository struct { Repository GitRepository `json:"repository,omitempty"`
API string `json:"url,omitempty"`
URL string `json:"html_url,omitempty"`
FullName string `json:"full_name,omitempty"`
} `json:"repository,omitempty"`
} }
type githubResponse struct { type githubResponse struct {

View File

@@ -7,7 +7,7 @@ import (
) )
const ( const (
perPageArg = "per_page" perPageArg = "per_page"
) )
const githubMaxPageSize = 100 const githubMaxPageSize = 100

View File

@@ -68,7 +68,7 @@ func TestQueryType(t *testing.T) {
func TestGithubSearchQuery(t *testing.T) { func TestGithubSearchQuery(t *testing.T) {
const ( const (
perPage = 100 perPage = 100
) )
testCases := []struct { testCases := []struct {
@@ -82,7 +82,7 @@ func TestGithubSearchQuery(t *testing.T) {
}{ }{
{ {
rc: RequestConfig{ rc: RequestConfig{
perPage: perPage, perPage: perPage,
}, },
codeQuery: Query{ codeQuery: Query{
Filename("kustomization.yaml"), Filename("kustomization.yaml"),

View File

@@ -44,9 +44,9 @@ type KustomizationDocument struct {
type set map[string]struct{} type set map[string]struct{}
func (doc *KustomizationDocument) String() string { func (doc *KustomizationDocument) String() string {
return fmt.Sprintf("%s %s %s %v %v %v %v %v", doc.RepositoryURL, doc.FilePath, return fmt.Sprintf("%s %s %s %v %v %v len(identifiers):%v len(values):%v",
doc.DefaultBranch, doc.CreationTime, doc.IsSame, doc.RepositoryURL, doc.FilePath, doc.DefaultBranch, doc.CreationTime,
doc.Kinds, doc.Identifiers, doc.Values) doc.IsSame, doc.Kinds, len(doc.Identifiers), len(doc.Values))
} }
// Implements the CrawlerDocument interface. // Implements the CrawlerDocument interface.

View File

@@ -1,7 +1,10 @@
package doc package doc
import ( import (
"crypto/sha256"
"fmt"
"path" "path"
"strings"
"time" "time"
"sigs.k8s.io/kustomize/api/internal/git" "sigs.k8s.io/kustomize/api/internal/git"
@@ -21,6 +24,17 @@ func (doc *Document) GetDocument() *Document {
return doc return doc
} }
func (doc *Document) Copy() *Document {
return &Document{
RepositoryURL: doc.RepositoryURL,
FilePath: doc.FilePath,
DefaultBranch: doc.DefaultBranch,
DocumentData: doc.DocumentData,
CreationTime: doc.CreationTime,
IsSame: doc.IsSame,
}
}
// Implements the CrawlerDocument interface. // Implements the CrawlerDocument interface.
func (doc *Document) WasCached() bool { func (doc *Document) WasCached() bool {
return doc.IsSame return doc.IsSame
@@ -53,6 +67,22 @@ func (doc *Document) FromRelativePath(newFile string) (Document, error) {
} }
func (doc *Document) ID() string { func (doc *Document) ID() string {
return doc.RepositoryURL + "/" + sum := sha256.Sum256([]byte(strings.Join(
doc.DefaultBranch + "/" + doc.FilePath []string{
doc.RepositoryURL,
doc.DefaultBranch,
doc.FilePath,
},
"---|---")))
return fmt.Sprintf("%x", sum)
}
func (doc *Document) RepositoryFullName() string {
doc.RepositoryURL = strings.TrimRight(doc.RepositoryURL, "/")
sections := strings.Split(doc.RepositoryURL, "/")
l := len(sections)
if l < 2 {
return doc.RepositoryURL
}
return path.Join(sections[l-2], sections[l-1])
} }

View File

@@ -62,3 +62,44 @@ func TestFromRelativePath(t *testing.T) {
} }
} }
} }
func TestDocument_RepositoryFullName(t *testing.T) {
testCases := []struct {
doc Document
expectedRepositoryFullName string
}{
{
doc: Document{
RepositoryURL: "https://github.com/user/repo",
},
expectedRepositoryFullName: "user/repo",
},
{
doc: Document{
RepositoryURL: "https://github.com//user/repo////",
},
expectedRepositoryFullName: "user/repo",
},
{
doc: Document{
RepositoryURL: "repo/",
},
expectedRepositoryFullName: "repo",
},
{
doc: Document{
RepositoryURL: "",
},
expectedRepositoryFullName: "",
},
}
for _, tc := range testCases {
returnedRepositoryFullName := tc.doc.RepositoryFullName()
if returnedRepositoryFullName != tc.expectedRepositoryFullName {
t.Errorf("RepositoryFullName expected %s, got %s",
tc.expectedRepositoryFullName,
returnedRepositoryFullName)
}
}
}