From bffc0d7071113ba3055fad4dac104ecaece1745d Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Thu, 5 Dec 2019 09:51:22 -0800 Subject: [PATCH] Mulitple improvements of the crawler 1) Set document IDs to avoid duplicating documents; 2) Set the `creationTime` field of each document in the index; 3) set the `values`, `kinds` and `identifiers` fields for all documents; 4) Add a `Copy` method into the `Document` struct: this fixes the issue where all the documents existing in the index point to the same Document object; 5) Avoid using keystore redis; 6) Set imagePullPolicy to `Always` for crawler jobs. --- api/internal/crawl/cmd/crawler/crawler.go | 14 ++----- .../crawl/config/crawler/cronjob/cronjob.yaml | 1 + .../crawl/config/crawler/job/job.yaml | 1 + .../config/webapp/backend/deployment.yaml | 1 + .../config/webapp/frontend/deployment.yaml | 1 + api/internal/crawl/crawler/crawler.go | 18 ++++---- api/internal/crawl/crawler/crawler_test.go | 2 +- api/internal/crawl/crawler/github/crawler.go | 36 ++++++++++++---- api/internal/crawl/crawler/github/queries.go | 2 +- .../crawl/crawler/github/queries_test.go | 4 +- api/internal/crawl/doc/doc.go | 6 +-- api/internal/crawl/doc/docname.go | 34 ++++++++++++++- api/internal/crawl/doc/docname_test.go | 41 +++++++++++++++++++ 13 files changed, 125 insertions(+), 36 deletions(-) diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index c8f01be0a..ae0abdbbb 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -39,16 +39,16 @@ func main() { } cacheURL := os.Getenv(redisCacheURL) - keystoreURL := os.Getenv(redisKeyURL) query := []byte(`{ "query":{ "match_all":{} } }`) it := idx.IterateQuery(query, 10000, 60*time.Second) docs := make(crawler.CrawlSeed, 0) for it.Next() { for _, hit := range it.Value().Hits.Hits { - docs = append(docs, hit.Document.GetDocument()) + docs = append(docs, hit.Document.Copy()) } } + if err := it.Err(); err != nil { fmt.Printf("Error iterating: %v\n", err) } @@ -61,12 +61,6 @@ func main() { clientCache = httpclient.NewClient(cache) } - _, err = redis.DialURL(keystoreURL) - if err != nil { - fmt.Printf("Error: redis could not make a connection: %v\n", err) - os.Exit(1) - } - ghCrawler := github.NewCrawler(githubToken, retryCount, clientCache, github.QueryWith( github.Filename("kustomization.yaml"), @@ -88,8 +82,8 @@ func main() { func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error { switch d := cdoc.(type) { case *doc.KustomizationDocument: - fmt.Println("Inserting: ", d) - _, err := idx.Put("", d) + fmt.Println("Inserting: ", d.ID(), d) + _, err := idx.Put(d.ID(), d) return err default: return fmt.Errorf("type %T not supported", d) diff --git a/api/internal/crawl/config/crawler/cronjob/cronjob.yaml b/api/internal/crawl/config/crawler/cronjob/cronjob.yaml index af4308d7b..ab333c782 100644 --- a/api/internal/crawl/config/crawler/cronjob/cronjob.yaml +++ b/api/internal/crawl/config/crawler/cronjob/cronjob.yaml @@ -12,6 +12,7 @@ spec: containers: - name: crawler image: gcr.io/kustomize-search/crawler:latest + imagePullPolicy: Always env: - name: GITHUB_ACCESS_TOKEN valueFrom: diff --git a/api/internal/crawl/config/crawler/job/job.yaml b/api/internal/crawl/config/crawler/job/job.yaml index 8c319ded4..dde0de398 100644 --- a/api/internal/crawl/config/crawler/job/job.yaml +++ b/api/internal/crawl/config/crawler/job/job.yaml @@ -9,6 +9,7 @@ spec: containers: - name: crawler image: gcr.io/kustomize-search/crawler:latest + imagePullPolicy: Always env: - name: GITHUB_ACCESS_TOKEN valueFrom: diff --git a/api/internal/crawl/config/webapp/backend/deployment.yaml b/api/internal/crawl/config/webapp/backend/deployment.yaml index e533cc227..b08ac1a24 100644 --- a/api/internal/crawl/config/webapp/backend/deployment.yaml +++ b/api/internal/crawl/config/webapp/backend/deployment.yaml @@ -17,6 +17,7 @@ spec: containers: - name: kustomize-search image: gcr.io/kustomize-search/backend:latest + imagePullPolicy: Always livenessProbe: httpGet: path: /liveness diff --git a/api/internal/crawl/config/webapp/frontend/deployment.yaml b/api/internal/crawl/config/webapp/frontend/deployment.yaml index 75169dd16..ced036a42 100644 --- a/api/internal/crawl/config/webapp/frontend/deployment.yaml +++ b/api/internal/crawl/config/webapp/frontend/deployment.yaml @@ -17,6 +17,7 @@ spec: containers: - name: frontend image: gcr.io/kustomize-search/frontend:latest + imagePullPolicy: Always ports: - name: frontend-port containerPort: 80 diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index b2da17944..a18d670ed 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -102,11 +102,9 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed, } doCrawl := func(docsPtr *CrawlSeed) { - for len(*docsPtr) > 0 { - back := len(*docsPtr) - 1 - next := (*docsPtr)[back] - *docsPtr = (*docsPtr)[:back] - + n := len(*docsPtr) + for i := 0; i < n; i++ { + next := (*docsPtr)[i] match := findMatch(next) if match == nil { logIfErr(fmt.Errorf( @@ -114,24 +112,28 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed, continue } + logger.Println("Crawling ", next.RepositoryURL, next.FilePath) err := match.FetchDocument(ctx, next) logIfErr(err) // If there was no change or there is an error, we don't have // to branch out, since the dependencies are already in the // index, or we cannot find the document. if err != nil || next.WasCached() { + if next.WasCached() { + logger.Println(next.RepositoryURL, next.FilePath, "is cached already") + } continue } + logIfErr(match.SetCreated(ctx, next)) + cdoc, err := conv(next) logIfErr(err) - if err != nil { - continue - } addBranches(cdoc, match) } } + // Exploit seed to update bulk of corpus. logger.Printf("updating %d documents from seed\n", len(seed)) doCrawl(&seed) diff --git a/api/internal/crawl/crawler/crawler_test.go b/api/internal/crawl/crawler/crawler_test.go index 576ba302f..e80428ddb 100644 --- a/api/internal/crawl/crawler/crawler_test.go +++ b/api/internal/crawl/crawler/crawler_test.go @@ -29,7 +29,7 @@ type testCrawler struct { } func (c testCrawler) Match(d *doc.Document) bool { - return d != nil && strings.HasPrefix(d.ID(), c.matchPrefix) + return d != nil } func (c testCrawler) FetchDocument(_ context.Context, d *doc.Document) error { diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 3bea81f57..1184171de 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -133,8 +133,12 @@ func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error } func (gc githubCrawler) SetCreated(_ context.Context, d *doc.Document) error { - fs := GhFileSpec{} - fs.Repository.FullName = d.RepositoryURL + "/" + d.FilePath + fs := GhFileSpec{ + Path: d.FilePath, + Repository: GitRepository{ + FullName: d.RepositoryFullName(), + }, + } creationTime, err := gc.client.GetFileCreationTime(fs) if err != nil { return err @@ -185,9 +189,9 @@ func processQuery(ctx context.Context, gcl GhClient, query string, for _, file := range page.Parsed.Items { k, err := kustomizationResultAdapter(gcl, file) if err != nil { + logger.Printf("kustomizationResultAdapter failed: %v", err) errs = append(errs, err) errorCnt++ - continue } output <- k totalCnt++ @@ -224,6 +228,18 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec) ( RepositoryURL: k.Repository.URL, }, } + logger.Printf("Set the creationTime field") + creationTime, err := gcl.GetFileCreationTime(k) + if err != nil { + logger.Printf("GetFileCreationTime failed: %v", err) + return &d, err + } + d.CreationTime = &creationTime + + if err := d.ParseYAML(); err != nil { + logger.Printf("ParseYAML failed: %v", err) + return &d, err + } return &d, nil } @@ -410,13 +426,15 @@ func (e multiError) Error() string { return strings.Join(strs, "\n") } +type GitRepository struct { + API string `json:"url,omitempty"` + URL string `json:"html_url,omitempty"` + FullName string `json:"full_name,omitempty"` +} + type GhFileSpec struct { - Path string `json:"path,omitempty"` - Repository struct { - API string `json:"url,omitempty"` - URL string `json:"html_url,omitempty"` - FullName string `json:"full_name,omitempty"` - } `json:"repository,omitempty"` + Path string `json:"path,omitempty"` + Repository GitRepository `json:"repository,omitempty"` } type githubResponse struct { diff --git a/api/internal/crawl/crawler/github/queries.go b/api/internal/crawl/crawler/github/queries.go index 7aa4e1531..d49252ec9 100644 --- a/api/internal/crawl/crawler/github/queries.go +++ b/api/internal/crawl/crawler/github/queries.go @@ -7,7 +7,7 @@ import ( ) const ( - perPageArg = "per_page" + perPageArg = "per_page" ) const githubMaxPageSize = 100 diff --git a/api/internal/crawl/crawler/github/queries_test.go b/api/internal/crawl/crawler/github/queries_test.go index 217ee79e8..d09acfe6b 100644 --- a/api/internal/crawl/crawler/github/queries_test.go +++ b/api/internal/crawl/crawler/github/queries_test.go @@ -68,7 +68,7 @@ func TestQueryType(t *testing.T) { func TestGithubSearchQuery(t *testing.T) { const ( - perPage = 100 + perPage = 100 ) testCases := []struct { @@ -82,7 +82,7 @@ func TestGithubSearchQuery(t *testing.T) { }{ { rc: RequestConfig{ - perPage: perPage, + perPage: perPage, }, codeQuery: Query{ Filename("kustomization.yaml"), diff --git a/api/internal/crawl/doc/doc.go b/api/internal/crawl/doc/doc.go index 4c3eb1c6f..5e4af4fc4 100644 --- a/api/internal/crawl/doc/doc.go +++ b/api/internal/crawl/doc/doc.go @@ -44,9 +44,9 @@ type KustomizationDocument struct { type set map[string]struct{} func (doc *KustomizationDocument) String() string { - return fmt.Sprintf("%s %s %s %v %v %v %v %v", doc.RepositoryURL, doc.FilePath, - doc.DefaultBranch, doc.CreationTime, doc.IsSame, - doc.Kinds, doc.Identifiers, doc.Values) + return fmt.Sprintf("%s %s %s %v %v %v len(identifiers):%v len(values):%v", + doc.RepositoryURL, doc.FilePath, doc.DefaultBranch, doc.CreationTime, + doc.IsSame, doc.Kinds, len(doc.Identifiers), len(doc.Values)) } // Implements the CrawlerDocument interface. diff --git a/api/internal/crawl/doc/docname.go b/api/internal/crawl/doc/docname.go index 44fef0236..ee05b2d3f 100644 --- a/api/internal/crawl/doc/docname.go +++ b/api/internal/crawl/doc/docname.go @@ -1,7 +1,10 @@ package doc import ( + "crypto/sha256" + "fmt" "path" + "strings" "time" "sigs.k8s.io/kustomize/api/internal/git" @@ -21,6 +24,17 @@ func (doc *Document) GetDocument() *Document { return doc } +func (doc *Document) Copy() *Document { + return &Document{ + RepositoryURL: doc.RepositoryURL, + FilePath: doc.FilePath, + DefaultBranch: doc.DefaultBranch, + DocumentData: doc.DocumentData, + CreationTime: doc.CreationTime, + IsSame: doc.IsSame, + } +} + // Implements the CrawlerDocument interface. func (doc *Document) WasCached() bool { return doc.IsSame @@ -53,6 +67,22 @@ func (doc *Document) FromRelativePath(newFile string) (Document, error) { } func (doc *Document) ID() string { - return doc.RepositoryURL + "/" + - doc.DefaultBranch + "/" + doc.FilePath + sum := sha256.Sum256([]byte(strings.Join( + []string{ + doc.RepositoryURL, + doc.DefaultBranch, + doc.FilePath, + }, + "---|---"))) + return fmt.Sprintf("%x", sum) +} + +func (doc *Document) RepositoryFullName() string { + doc.RepositoryURL = strings.TrimRight(doc.RepositoryURL, "/") + sections := strings.Split(doc.RepositoryURL, "/") + l := len(sections) + if l < 2 { + return doc.RepositoryURL + } + return path.Join(sections[l-2], sections[l-1]) } diff --git a/api/internal/crawl/doc/docname_test.go b/api/internal/crawl/doc/docname_test.go index b387aa7a3..afcd702cb 100644 --- a/api/internal/crawl/doc/docname_test.go +++ b/api/internal/crawl/doc/docname_test.go @@ -62,3 +62,44 @@ func TestFromRelativePath(t *testing.T) { } } } + +func TestDocument_RepositoryFullName(t *testing.T) { + testCases := []struct { + doc Document + expectedRepositoryFullName string + }{ + { + doc: Document{ + RepositoryURL: "https://github.com/user/repo", + }, + expectedRepositoryFullName: "user/repo", + }, + { + doc: Document{ + RepositoryURL: "https://github.com//user/repo////", + }, + expectedRepositoryFullName: "user/repo", + }, + { + doc: Document{ + RepositoryURL: "repo/", + }, + expectedRepositoryFullName: "repo", + }, + { + doc: Document{ + RepositoryURL: "", + }, + expectedRepositoryFullName: "", + }, + } + + for _, tc := range testCases { + returnedRepositoryFullName := tc.doc.RepositoryFullName() + if returnedRepositoryFullName != tc.expectedRepositoryFullName { + t.Errorf("RepositoryFullName expected %s, got %s", + tc.expectedRepositoryFullName, + returnedRepositoryFullName) + } + } +} \ No newline at end of file