From e44d1298dfc49eb9415f59d56860d48e429f0144 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Mon, 16 Dec 2019 10:51:40 -0800 Subject: [PATCH 01/12] Return errors if http Client.Do resp status code is not 2xx --- api/internal/crawl/crawler/github/crawler.go | 26 +++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 66fa9b20c..78e9e8411 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -115,18 +115,18 @@ func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error d.FilePath = d.FilePath + path return nil } + return err } - resp, err := gc.client.GetRawUserContent(url) - if err := handle(resp, err, ""); err == nil { + resp, errGetRawUserContent := gc.client.GetRawUserContent(url) + if err := handle(resp, errGetRawUserContent, ""); err == nil { return nil } for _, file := range konfig.RecognizedKustomizationFileNames() { - resp, err = gc.client.GetRawUserContent(url + "/" + file) - err := handle(resp, err, "/"+file) - if err != nil { - continue + resp, errGetRawUserContent = gc.client.GetRawUserContent(url + "/" + file) + if err = handle(resp, errGetRawUserContent, "/"+file); err == nil { + return nil } } return fmt.Errorf("file not found: %s, error: %v", url, err) @@ -559,7 +559,15 @@ func (gcl GhClient) Do(query string) (*http.Response, error) { return nil, err } req.Header.Add("Authorization", fmt.Sprintf("token %s", gcl.accessToken)) - return gcl.client.Do(req) + + // gcl.client.Do: a non-2xx status code doesn't cause an error. + // See https://golang.org/pkg/net/http/#Client.Do for more info. 
+ resp, err := gcl.client.Do(req) + if resp.StatusCode != http.StatusOK { + err = fmt.Errorf("GhClient.Do(%s) failed with response code: %d", + query, resp.StatusCode) + } + return resp, err } func (gcl GhClient) getWithRetry( @@ -574,8 +582,8 @@ func (gcl GhClient) getWithRetry( retryCount > 0 { retryTime := resp.Header.Get("Retry-After") - i, err := strconv.Atoi(retryTime) - if err != nil { + i, errAtoi := strconv.Atoi(retryTime) + if errAtoi != nil { return resp, fmt.Errorf( "query '%s' forbidden without 'Retry-After'", query) } From 12fc8f41c74ff7794491de4e26acf60901549263 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Mon, 16 Dec 2019 11:08:38 -0800 Subject: [PATCH 02/12] Add support for github paths starting with "git@github.com:" --- api/internal/crawl/doc/docname.go | 12 +++++++++--- api/internal/crawl/doc/docname_test.go | 6 ++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/api/internal/crawl/doc/docname.go b/api/internal/crawl/doc/docname.go index ee05b2d3f..5afca1290 100644 --- a/api/internal/crawl/doc/docname.go +++ b/api/internal/crawl/doc/docname.go @@ -78,11 +78,17 @@ func (doc *Document) ID() string { } func (doc *Document) RepositoryFullName() string { - doc.RepositoryURL = strings.TrimRight(doc.RepositoryURL, "/") - sections := strings.Split(doc.RepositoryURL, "/") + url := strings.TrimRight(doc.RepositoryURL, "/") + + gitPrefix := "git@github.com:" + if strings.HasPrefix(url, gitPrefix) { + url = url[len(gitPrefix):] + } + + sections := strings.Split(url, "/") l := len(sections) if l < 2 { - return doc.RepositoryURL + return url } return path.Join(sections[l-2], sections[l-1]) } diff --git a/api/internal/crawl/doc/docname_test.go b/api/internal/crawl/doc/docname_test.go index afcd702cb..f1b65dc8f 100644 --- a/api/internal/crawl/doc/docname_test.go +++ b/api/internal/crawl/doc/docname_test.go @@ -92,6 +92,12 @@ func TestDocument_RepositoryFullName(t *testing.T) { }, expectedRepositoryFullName: "", }, + { + doc: Document{ + 
RepositoryURL: "git@github.com:user/repo", + }, + expectedRepositoryFullName: "user/repo", + }, } for _, tc := range testCases { From 8c89f0946c452aacb2b0308b858ae7dcbda48788 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Mon, 16 Dec 2019 11:39:54 -0800 Subject: [PATCH 03/12] Avoid indexing a document if FetchDocument or SetCreated fails --- api/internal/crawl/crawler/crawler.go | 36 +++++++++++--------- api/internal/crawl/crawler/github/crawler.go | 3 ++ api/internal/crawl/doc/doc.go | 2 ++ 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index f57f247e5..410c3454b 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -72,17 +72,17 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, seen[cdoc.ID()] = struct{}{} // Insert into index - err := indx(cdoc, match) - logIfErr(err) - if err != nil { + if err := indx(cdoc, match); err != nil { + logger.Println("Failed to index: ", err) return } deps, err := cdoc.GetResources() - logIfErr(err) if err != nil { + logger.Println(err) return } + for _, dep := range deps { if _, ok := seen[dep.ID()]; ok { continue @@ -107,29 +107,33 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C } docCount++ + if tail.WasCached() { + logger.Printf("%s %s is cached already", tail.RepositoryURL, tail.FilePath) + continue + } + match := findMatch(tail, crawlers) if match == nil { - logIfErr(fmt.Errorf( - "%v could not match any crawler", tail)) + logIfErr(fmt.Errorf("%v could not match any crawler", tail)) continue } logger.Println("Crawling ", tail.RepositoryURL, tail.FilePath) - err := match.FetchDocument(ctx, tail) - logIfErr(err) - // If there was no change or there is an error, we don't have - // to branch out, since the dependencies are already in the - // index, or we cannot find the document. 
- if err != nil || tail.WasCached() { - if tail.WasCached() { - logger.Println(tail.RepositoryURL, tail.FilePath, "is cached already") - } + if err := match.FetchDocument(ctx, tail); err != nil { + logger.Printf("FetchDocument failed on %s %s: %v", + tail.RepositoryURL, tail.FilePath, err) continue } - logIfErr(match.SetCreated(ctx, tail)) + if err := match.SetCreated(ctx, tail); err != nil { + logger.Printf("SetCreated failed on %s %s: %v", + tail.RepositoryURL, tail.FilePath, err) + continue + } cdoc, err := conv(tail) + // If conv returns an error, cdoc can still be added into the index so that + // cdoc.Document can be searched. logIfErr(err) addBranches(cdoc, match, indx, seen, stack) diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 78e9e8411..83a118553 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -93,6 +93,9 @@ func (gc githubCrawler) Crawl( return nil } +// FetchDocument first tries to fetch the document with d.FilePath. If it fails, +// it will try to add each string in konfig.RecognizedKustomizationFileNames() to +// d.FilePath, and try to fetch the document again. func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error { repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch repoSpec, err := git.NewRepoSpecFromUrl(repoURL) diff --git a/api/internal/crawl/doc/doc.go b/api/internal/crawl/doc/doc.go index 5e4af4fc4..50241e5ad 100644 --- a/api/internal/crawl/doc/doc.go +++ b/api/internal/crawl/doc/doc.go @@ -116,6 +116,8 @@ func (doc *KustomizationDocument) readBytes() ([]map[string]interface{}, error) return configs, nil } +// ParseYAML parses doc.Document and sets the following fields of doc: +// Kinds, Values, Identifiers. 
func (doc *KustomizationDocument) ParseYAML() error { doc.Identifiers = make([]string, 0) doc.Values = make([]string, 0) From 5598d35e4b8bc54a66ad9606345df1808e0c6390 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Mon, 16 Dec 2019 12:13:27 -0800 Subject: [PATCH 04/12] Add a summary for doCrawl --- api/internal/crawl/crawler/crawler.go | 32 +++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 410c3454b..30d95ec0f 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -93,7 +93,15 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc, seen map[string]struct{}, stack *CrawlSeed) { - docCount := 0 + + UpdatedDocCount := 0 + seenDocCount := 0 + cachedDocCount := 0 + findMatchErrCount := 0 + FetchDocumentErrCount := 0 + SetCreatedErrCount := 0 + convErrCount := 0 + // During the execution of the for loop, more Documents may be added into (*docsPtr). for len(*docsPtr) > 0 { // get the last Document in (*docPtr), which will be crawled in this iteration. 
@@ -103,18 +111,20 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C *docsPtr = (*docsPtr)[:(len(*docsPtr) - 1)] if _, ok := seen[tail.ID()]; ok { + seenDocCount++ continue } - docCount++ if tail.WasCached() { logger.Printf("%s %s is cached already", tail.RepositoryURL, tail.FilePath) + cachedDocCount++ continue } match := findMatch(tail, crawlers) if match == nil { logIfErr(fmt.Errorf("%v could not match any crawler", tail)) + findMatchErrCount++ continue } @@ -122,23 +132,37 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C if err := match.FetchDocument(ctx, tail); err != nil { logger.Printf("FetchDocument failed on %s %s: %v", tail.RepositoryURL, tail.FilePath, err) + FetchDocumentErrCount++ continue } if err := match.SetCreated(ctx, tail); err != nil { logger.Printf("SetCreated failed on %s %s: %v", tail.RepositoryURL, tail.FilePath, err) + SetCreatedErrCount++ continue } cdoc, err := conv(tail) // If conv returns an error, cdoc can still be added into the index so that // cdoc.Document can be searched. 
- logIfErr(err) + if err != nil { + logger.Printf("conv failed on %s %s: %v", + tail.RepositoryURL, tail.FilePath, err) + convErrCount++ + } + UpdatedDocCount++ addBranches(cdoc, match, indx, seen, stack) } - logger.Printf("%d documents were crawled by doCrawl\n", docCount) + logger.Printf("Summary of doCrawl:\n") + logger.Printf("\t%d documents were updated\n", UpdatedDocCount) + logger.Printf("\t%d documents were seen by the crawler already and skipped\n", seenDocCount) + logger.Printf("\t%d documents were cached already and skipped\n", cachedDocCount) + logger.Printf("\t%d documents didn't have a matching crawler and skipped\n", findMatchErrCount) + logger.Printf("\t%d documents cannot be fetched and skipped\n", FetchDocumentErrCount) + logger.Printf("\t%d documents cannot update its creation time and skipped\n", SetCreatedErrCount) + logger.Printf("\t%d documents cannot be converted and skipped\n", convErrCount) } // CrawlFromSeed updates all the documents in seed, and crawls all the new From 272b7a6fcd7fd06fe4a9da5bbcbca4e63df9d4b7 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Mon, 16 Dec 2019 12:39:13 -0800 Subject: [PATCH 05/12] Use `UpdateRequest` to insert/update a document Currently, `IndexRequest` is used to insert/update a document, which increases the version of the document every time IndexRequest.Do is called. --- api/internal/crawl/index/elasticsearch.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/api/internal/crawl/index/elasticsearch.go b/api/internal/crawl/index/elasticsearch.go index 8893dc2e8..2696d8dfd 100644 --- a/api/internal/crawl/index/elasticsearch.go +++ b/api/internal/crawl/index/elasticsearch.go @@ -180,12 +180,15 @@ func (idx *index) DeleteIndex() error { // Insert or update the document by ID. 
func (idx *index) Put(uniqueID string, doc interface{}) (string, error) { - body, err := json.Marshal(doc) + docBytes, err := json.Marshal(doc) if err != nil { return "", err } + body := byteJoin(`{"doc":`, docBytes, `}`) - req := esapi.IndexRequest{ + // Use `UpdateRequest` here instead of `IndexRequest`. + // For a document with a given id, every call of IndexRequest.Do will increase the version of a document. + req := esapi.UpdateRequest{ Index: idx.name, Body: bytes.NewReader(body), DocumentID: uniqueID, From 1eb713157ccc0fee511aa2089ba5cd5e32854770 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Mon, 16 Dec 2019 15:13:59 -0800 Subject: [PATCH 06/12] Sort the string slice fields of a document to avoid updating the index unnecessarily --- api/internal/crawl/doc/doc.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/api/internal/crawl/doc/doc.go b/api/internal/crawl/doc/doc.go index 50241e5ad..6ab581458 100644 --- a/api/internal/crawl/doc/doc.go +++ b/api/internal/crawl/doc/doc.go @@ -2,6 +2,7 @@ package doc import ( "fmt" + "sort" "strings" "sigs.k8s.io/kustomize/api/k8sdeps/kunstruct" @@ -161,6 +162,13 @@ func (doc *KustomizationDocument) ParseYAML() error { doc.Identifiers = append(doc.Identifiers, key) } + // Without sorting these fields, every time when the string order in these fields changes, + // the document in the index will be updated. + // Sorting these fields are necessary to avoid a document being updated unnecessarily. 
+ sort.Strings(doc.Kinds) + sort.Strings(doc.Values) + sort.Strings(doc.Identifiers) + return nil } From 2c2aa928cc17c9fa2b4a17dcc757b72840760683 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 17 Dec 2019 09:32:11 -0800 Subject: [PATCH 07/12] Delete non-existing documents from the index --- api/internal/crawl/cmd/crawler/crawler.go | 14 +++++++--- api/internal/crawl/crawler/crawler.go | 27 +++++++++++++++----- api/internal/crawl/crawler/crawler_test.go | 3 ++- api/internal/crawl/crawler/github/crawler.go | 5 +--- api/internal/crawl/index/kustomize.go | 11 ++++++++ 5 files changed, 44 insertions(+), 16 deletions(-) diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index 7e14f7072..c4e913706 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -65,12 +65,18 @@ func main() { } // Index updates the value in the index. - index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error { + index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler, mode index.Mode) error { switch d := cdoc.(type) { case *doc.KustomizationDocument: - fmt.Println("Inserting: ", d) - _, err := idx.Put(d.ID(), d) - return err + switch mode { + case index.Delete: + fmt.Println("Deleting: ", d) + return idx.Delete(d.ID()) + default: + fmt.Println("Inserting: ", d) + _, err := idx.Put(d.ID(), d) + return err + } default: return fmt.Errorf("type %T not supported", d) } diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 30d95ec0f..49c2403d1 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -8,6 +8,7 @@ import ( "fmt" "log" "os" + "sigs.k8s.io/kustomize/api/internal/crawl/index" "sync" _ "github.com/gomodule/redigo/redis" @@ -47,7 +48,7 @@ type CrawledDocument interface { type CrawlSeed []*doc.Document -type IndexFunc func(CrawledDocument, Crawler) error +type IndexFunc 
func(CrawledDocument, Crawler, index.Mode) error type Converter func(*doc.Document) (CrawledDocument, error) func logIfErr(err error) { @@ -72,8 +73,9 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, seen[cdoc.ID()] = struct{}{} // Insert into index - if err := indx(cdoc, match); err != nil { - logger.Println("Failed to index: ", err) + if err := indx(cdoc, match, index.InsertOrUpdate); err != nil { + logger.Printf("Failed to insert or update %s %s: %v", + cdoc.GetDocument().RepositoryURL, cdoc.GetDocument().FilePath, err) return } @@ -101,6 +103,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C FetchDocumentErrCount := 0 SetCreatedErrCount := 0 convErrCount := 0 + deleteDocCount := 0 // During the execution of the for loop, more Documents may be added into (*docsPtr). for len(*docsPtr) > 0 { @@ -133,6 +136,16 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C logger.Printf("FetchDocument failed on %s %s: %v", tail.RepositoryURL, tail.FilePath, err) FetchDocumentErrCount++ + // delete the document from the index + cdoc := &doc.KustomizationDocument{ + Document: *tail, + } + seen[cdoc.ID()] = struct{}{} + if err := indx(cdoc, match, index.Delete); err != nil { + logger.Printf("Failed to delete %s %s: %v", + cdoc.RepositoryURL, cdoc.FilePath, err) + } + deleteDocCount++ continue } @@ -140,7 +153,6 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C logger.Printf("SetCreated failed on %s %s: %v", tail.RepositoryURL, tail.FilePath, err) SetCreatedErrCount++ - continue } cdoc, err := conv(tail) @@ -160,9 +172,10 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C logger.Printf("\t%d documents were seen by the crawler already and skipped\n", seenDocCount) logger.Printf("\t%d documents were cached already and skipped\n", cachedDocCount) logger.Printf("\t%d documents didn't have a matching crawler and skipped\n", 
findMatchErrCount) - logger.Printf("\t%d documents cannot be fetched and skipped\n", FetchDocumentErrCount) - logger.Printf("\t%d documents cannot update its creation time and skipped\n", SetCreatedErrCount) - logger.Printf("\t%d documents cannot be converted and skipped\n", convErrCount) + logger.Printf("\t%d documents cannot be fetched, %d out of them are deleted\n", + FetchDocumentErrCount, deleteDocCount) + logger.Printf("\t%d documents cannot update its creation time but still were inserted or updated in the index\n", SetCreatedErrCount) + logger.Printf("\t%d documents cannot be converted but still were inserted or updated in the index\n", convErrCount) } // CrawlFromSeed updates all the documents in seed, and crawls all the new diff --git a/api/internal/crawl/crawler/crawler_test.go b/api/internal/crawl/crawler/crawler_test.go index 64887b1fc..ec479facf 100644 --- a/api/internal/crawl/crawler/crawler_test.go +++ b/api/internal/crawl/crawler/crawler_test.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "reflect" + "sigs.k8s.io/kustomize/api/internal/crawl/index" "sort" "strings" "sync" @@ -316,7 +317,7 @@ resources: Document: *d, }, nil }, - func(d CrawledDocument, cr Crawler) error { + func(d CrawledDocument, cr Crawler, mode index.Mode) error { visited[d.ID()]++ return nil }, diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 83a118553..fec1628b0 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -580,10 +580,7 @@ func (gcl GhClient) getWithRetry( retryCount := gcl.retryCount - for err == nil && - resp.StatusCode == http.StatusForbidden && - retryCount > 0 { - + for resp.StatusCode == http.StatusForbidden && retryCount > 0 { retryTime := resp.Header.Get("Retry-After") i, errAtoi := strconv.Atoi(retryTime) if errAtoi != nil { diff --git a/api/internal/crawl/index/kustomize.go b/api/internal/crawl/index/kustomize.go index 79833d3fd..430142fc3 
100644 --- a/api/internal/crawl/index/kustomize.go +++ b/api/internal/crawl/index/kustomize.go @@ -16,6 +16,12 @@ const ( AggregationKeyword = "aggs" ) +type Mode int +const ( + InsertOrUpdate = iota + Delete +) + // Redefinition of Hits structure. Must match the json string of // KustomizeResult.Hits.Hits. Declared as a convenience for iteration. type KustomizeHits []struct { @@ -301,6 +307,11 @@ func (ki *KustomizeIndex) Put(id string, doc *doc.KustomizationDocument) (string return id, nil } +// Delete a document with a given id from the kustomize index. +func (ki *KustomizeIndex) Delete(id string) error { + return ki.index.Delete(id) +} + // Kustomize search options: What metrics should be returned? Kind Aggregation, // TimeseriesAggregation, etc. Also embedds the SearchOptions field to specify // the position in the sorted list of results and the number of results to return. From bef157d6b38d15917b81567429f20718b995a880 Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 17 Dec 2019 12:14:41 -0800 Subject: [PATCH 08/12] Fix insert/updating document logic --- api/internal/crawl/cmd/crawler/crawler.go | 9 ++- api/internal/crawl/index/elasticsearch.go | 82 ++++++++++++++--------- api/internal/crawl/index/kustomize.go | 8 +-- 3 files changed, 57 insertions(+), 42 deletions(-) diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index c4e913706..ea73efd04 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -65,7 +65,7 @@ func main() { } // Index updates the value in the index. 
- index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler, mode index.Mode) error { + indexFunc := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler, mode index.Mode) error { switch d := cdoc.(type) { case *doc.KustomizationDocument: switch mode { @@ -74,8 +74,7 @@ func main() { return idx.Delete(d.ID()) default: fmt.Println("Inserting: ", d) - _, err := idx.Put(d.ID(), d) - return err + return idx.Put(d.ID(), d) } default: return fmt.Errorf("type %T not supported", d) @@ -123,6 +122,6 @@ func main() { } crawlers := []crawler.Crawler{ghCrawler} - crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, index, seen) - crawler.CrawlGithub(ctx, crawlers, docConverter, index, seen) + crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) + crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) } diff --git a/api/internal/crawl/index/elasticsearch.go b/api/internal/crawl/index/elasticsearch.go index 2696d8dfd..b80d7f901 100644 --- a/api/internal/crawl/index/elasticsearch.go +++ b/api/internal/crawl/index/elasticsearch.go @@ -6,7 +6,6 @@ import ( "encoding/json" "fmt" "io" - "io/ioutil" "time" es "github.com/elastic/go-elasticsearch/v6" @@ -179,47 +178,47 @@ func (idx *index) DeleteIndex() error { } // Insert or update the document by ID. -func (idx *index) Put(uniqueID string, doc interface{}) (string, error) { - docBytes, err := json.Marshal(doc) +func (idx *index) Put(uniqueID string, doc interface{}) error { + exists, err := idx.Exists(uniqueID) if err != nil { - return "", err + return err } - body := byteJoin(`{"doc":`, docBytes, `}`) - // Use `UpdateRequest` here instead of `IndexRequest`. - // For a document with a given id, every call of IndexRequest.Do will increase the version of a document. 
- req := esapi.UpdateRequest{ - Index: idx.name, - Body: bytes.NewReader(body), - DocumentID: uniqueID, - } - res, err := req.Do(idx.ctx, idx.client) - - var id string - readId := func(reader io.Reader) error { - type InsertResult struct { - ID string `json:"_id,omitempty"` + if exists { + docBytes, err := json.Marshal(doc) + if err != nil { + return err } - var ir InsertResult - data, err := ioutil.ReadAll(reader) + body := byteJoin(`{"doc":`, docBytes, `}`) + + // For a document with a given id, every call of IndexRequest.Do will increase the version of a document. + // To avoid increasing the document version unnecessarily, use UpdateRequest here. + req := esapi.UpdateRequest{ + Index: idx.name, + Body: bytes.NewReader(body), + DocumentID: uniqueID, + } + res, err := req.Do(idx.ctx, idx.client) + + err = idx.responseErrorOrNil("could not update document", + res, err, ignoreResponseBody) + } else { + body, err := json.Marshal(doc) if err != nil { return err } - err = json.Unmarshal(data, &ir) - if err != nil { - return err + req := esapi.IndexRequest{ + Index: idx.name, + Body: bytes.NewReader(body), + DocumentID: uniqueID, } - id = ir.ID + res, err := req.Do(idx.ctx, idx.client) - return nil + err = idx.responseErrorOrNil("could not insert document", + res, err, ignoreResponseBody) } - - // populates the id field. 
- err = idx.responseErrorOrNil("could not insert document", - res, err, readId) - - return id, err + return err } type scrollUpdater func(string, readerFunc) error @@ -299,3 +298,24 @@ func (idx *index) Delete(id string) error { fmt.Sprintf("could not delete id(%s) from index(%s)", id, idx.name), res, err, ignoreResponseBody) } + +// Check whether a given document id is in the index +func (idx *index) Exists(id string) (bool, error) { + op := idx.client.Exists + res, err := op( + idx.name, + id, + op.WithContext(idx.ctx), + op.WithPretty(), + ) + + if !res.IsError() { + return true, nil + } else if res.StatusCode == 404 { + return false, nil + } else { + return false, idx.responseErrorOrNil( + fmt.Sprintf("could not check the existence of id(%s) from index(%s)", id, idx.name), + res, err, ignoreResponseBody) + } +} diff --git a/api/internal/crawl/index/kustomize.go b/api/internal/crawl/index/kustomize.go index 430142fc3..cedea28bb 100644 --- a/api/internal/crawl/index/kustomize.go +++ b/api/internal/crawl/index/kustomize.go @@ -299,12 +299,8 @@ func (ki *KustomizeIndex) IterateQuery(query []byte, batchSize int, } // type specific Put for inserting structured kustomization documents. -func (ki *KustomizeIndex) Put(id string, doc *doc.KustomizationDocument) (string, error) { - id, err := ki.index.Put(id, doc) - if err != nil { - return id, fmt.Errorf("could not insert in elastic: %v", err) - } - return id, nil +func (ki *KustomizeIndex) Put(id string, doc *doc.KustomizationDocument) error { + return ki.index.Put(id, doc) } // Delete a document with a given id from the kustomize index. 
From a35f00213914802c6e53b3d464640b3485597d3c Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 17 Dec 2019 12:31:23 -0800 Subject: [PATCH 09/12] Run `goimports` --- api/internal/crawl/crawler/crawler.go | 5 +++-- api/internal/crawl/crawler/crawler_test.go | 3 ++- api/internal/crawl/doc/doc.go | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 49c2403d1..d46cf161d 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -8,9 +8,10 @@ import ( "fmt" "log" "os" - "sigs.k8s.io/kustomize/api/internal/crawl/index" "sync" + "sigs.k8s.io/kustomize/api/internal/crawl/index" + _ "github.com/gomodule/redigo/redis" "sigs.k8s.io/kustomize/api/internal/crawl/doc" @@ -138,7 +139,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C FetchDocumentErrCount++ // delete the document from the index cdoc := &doc.KustomizationDocument{ - Document: *tail, + Document: *tail, } seen[cdoc.ID()] = struct{}{} if err := indx(cdoc, match, index.Delete); err != nil { diff --git a/api/internal/crawl/crawler/crawler_test.go b/api/internal/crawl/crawler/crawler_test.go index ec479facf..00a619c46 100644 --- a/api/internal/crawl/crawler/crawler_test.go +++ b/api/internal/crawl/crawler/crawler_test.go @@ -5,13 +5,14 @@ import ( "errors" "fmt" "reflect" - "sigs.k8s.io/kustomize/api/internal/crawl/index" "sort" "strings" "sync" "testing" "time" + "sigs.k8s.io/kustomize/api/internal/crawl/index" + "sigs.k8s.io/kustomize/api/internal/crawl/doc" "sigs.k8s.io/kustomize/api/konfig" ) diff --git a/api/internal/crawl/doc/doc.go b/api/internal/crawl/doc/doc.go index 6ab581458..953f8d4b4 100644 --- a/api/internal/crawl/doc/doc.go +++ b/api/internal/crawl/doc/doc.go @@ -47,7 +47,7 @@ type set map[string]struct{} func (doc *KustomizationDocument) String() string { return fmt.Sprintf("%s %s %s %v %v %v len(identifiers):%v len(values):%v", 
doc.RepositoryURL, doc.FilePath, doc.DefaultBranch, doc.CreationTime, - doc.IsSame, doc.Kinds, len(doc.Identifiers), len(doc.Values)) + doc.IsSame, doc.Kinds, len(doc.Identifiers), len(doc.Values)) } // Implements the CrawlerDocument interface. From f5ff254203320e2f02ed8f917ff1f432539cfedf Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 17 Dec 2019 12:37:34 -0800 Subject: [PATCH 10/12] Update deps --- api/internal/crawl/go.mod | 4 +-- api/internal/crawl/go.sum | 57 +++++++++------------------------------ 2 files changed, 14 insertions(+), 47 deletions(-) diff --git a/api/internal/crawl/go.mod b/api/internal/crawl/go.mod index cabc30d5b..b8ddb8140 100644 --- a/api/internal/crawl/go.mod +++ b/api/internal/crawl/go.mod @@ -1,9 +1,9 @@ module sigs.k8s.io/kustomize/api/internal/crawl -go 1.13 +go 1.12 require ( - github.com/elastic/go-elasticsearch/v6 v6.8.2 + github.com/elastic/go-elasticsearch/v6 v6.8.5 github.com/gomodule/redigo v2.0.0+incompatible github.com/gorilla/mux v1.7.3 github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 diff --git a/api/internal/crawl/go.sum b/api/internal/crawl/go.sum index 16214211e..9fb6aa083 100644 --- a/api/internal/crawl/go.sum +++ b/api/internal/crawl/go.sum @@ -2,14 +2,10 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= github.com/Azure/go-autorest/autorest v0.9.0/go.mod h1:xyHB1BMZT0cuDHU7I0+g046+BFDTQ8rEZB0s4Yfa6bI= -github.com/Azure/go-autorest/autorest v0.9.2/go.mod h1:xyHB1BMZT0cuDHU7I0+g046+BFDTQ8rEZB0s4Yfa6bI= github.com/Azure/go-autorest/autorest/adal v0.5.0/go.mod h1:8Z9fGy2MpX0PvDjB1pEgQTmVqjGhiHBW7RJJEciWzS0= -github.com/Azure/go-autorest/autorest/adal v0.8.0/go.mod h1:Z6vX6WXXuyieHAXwMj0S6HY6e6wcHn37qQMBQlvY3lc= github.com/Azure/go-autorest/autorest/date v0.1.0/go.mod 
h1:plvfp3oPSKwf2DNjlBjWF/7vwR+cUD/ELuzDCXwHUVA= -github.com/Azure/go-autorest/autorest/date v0.2.0/go.mod h1:vcORJHLJEh643/Ioh9+vPmf1Ij9AEBM5FuBIXLmIy0g= github.com/Azure/go-autorest/autorest/mocks v0.1.0/go.mod h1:OTyCOPRA2IgIlWxVYxBee2F5Gr4kF2zd2J5cFRaIDN0= github.com/Azure/go-autorest/autorest/mocks v0.2.0/go.mod h1:OTyCOPRA2IgIlWxVYxBee2F5Gr4kF2zd2J5cFRaIDN0= -github.com/Azure/go-autorest/autorest/mocks v0.3.0/go.mod h1:a8FDP3DYzQ4RYfVAxAN3SVSiiO77gL2j2ronKKP0syM= github.com/Azure/go-autorest/logger v0.1.0/go.mod h1:oExouG+K6PryycPJfVSxi/koC6LSNgds39diKLz7Vrc= github.com/Azure/go-autorest/tracing v0.5.0/go.mod h1:r/s2XiOKccPW3HrqB+W0TQzfbtp2fGCgRFtBroKn4Dk= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= @@ -44,14 +40,10 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/docker/spdystream v0.0.0-20160310174837-449fdfce4d96/go.mod h1:Qh8CwZgvJUkLughtfhJv5dyTYa91l1fOUCrgjqmcifM= -github.com/docker/spdystream v0.0.0-20181023171402-6480d4af844c/go.mod h1:Qh8CwZgvJUkLughtfhJv5dyTYa91l1fOUCrgjqmcifM= -github.com/elastic/go-elasticsearch/v6 v6.8.2 h1:rp5DGrd63V5c6nHLjF6QEXUpZSvs0+QM3ld7m9VhV2g= -github.com/elastic/go-elasticsearch/v6 v6.8.2/go.mod h1:UwaDJsD3rWLM5rKNFzv9hgox93HoX8utj1kxD9aFUcI= +github.com/elastic/go-elasticsearch/v6 v6.8.5 h1:U2HtkBseC1FNBmDr0TR2tKltL6FxoY+niDAlj5M8TK8= +github.com/elastic/go-elasticsearch/v6 v6.8.5/go.mod h1:UwaDJsD3rWLM5rKNFzv9hgox93HoX8utj1kxD9aFUcI= github.com/elazarl/goproxy v0.0.0-20170405201442-c4fc26588b6e/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= -github.com/elazarl/goproxy v0.0.0-20191011121108-aa519ddbe484/go.mod h1:Ro8st/ElPeALwNFlcTpWmkr6IoMFfkjXAvTHpevnDsM= -github.com/elazarl/goproxy/ext 
v0.0.0-20190711103511-473e67f1d7d2/go.mod h1:gNh8nYJoAm43RfaxurUnxr+N1PwuFV3ZMl/efxlIlY8= github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= -github.com/emicklei/go-restful v2.9.6+incompatible/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= github.com/evanphx/json-patch v4.2.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch v4.5.0+incompatible h1:ouOWdg56aJriqS0huScTkVXPC5IcNrDCXZ6OoTAWu7M= github.com/evanphx/json-patch v4.5.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= @@ -95,13 +87,11 @@ github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJA github.com/gofrs/flock v0.0.0-20190320160742-5135e617513b/go.mod h1:F1TvTiK9OcQqauNUHlbJvyl9Qa1QvF/gOUDKA14jxHU= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= +github.com/gogo/protobuf v1.2.2-0.20190723190241-65acae22fc9d h1:3PaI8p3seN09VjbTYC/QWlUZdZ1qS1zGjy7LH2Wt07I= github.com/gogo/protobuf v1.2.2-0.20190723190241-65acae22fc9d/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= -github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls= -github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= -github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock 
v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v0.0.0-20161109072736-4bd1920723d7/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -115,12 +105,9 @@ github.com/golangci/errcheck v0.0.0-20181223084120-ef45e06d44b6/go.mod h1:DbHgvL github.com/golangci/go-misc v0.0.0-20180628070357-927a3d87b613/go.mod h1:SyvUF2NxV+sN8upjjeVYr5W7tyxaT1JVtvhKhOn2ii8= github.com/golangci/goconst v0.0.0-20180610141641-041c5f2b40f3/go.mod h1:JXrF4TWy4tXYn62/9x8Wm/K/dm06p8tCKwFRDPZG/1o= github.com/golangci/gocyclo v0.0.0-20180528134321-2becd97e67ee/go.mod h1:ozx7R9SIwqmqf5pRP90DhR2Oay2UIjGuKheCBCNwAYU= -github.com/golangci/gofmt v0.0.0-20181222123516-0b8337e80d98/go.mod h1:9qCChq59u/eW8im404Q2WWTrnBUQKjpNYKMbU4M7EFU= github.com/golangci/gofmt v0.0.0-20190930125516-244bba706f1a/go.mod h1:9qCChq59u/eW8im404Q2WWTrnBUQKjpNYKMbU4M7EFU= -github.com/golangci/golangci-lint v1.19.1/go.mod h1:2CEc4Fxx3vxDv7g8DyXkHCBF73AOzAymcJAprs2vCps= github.com/golangci/golangci-lint v1.21.0/go.mod h1:phxpHK52q7SE+5KpPnti4oZTdFCEsn/tKN+nFvCKXfk= github.com/golangci/ineffassign v0.0.0-20190609212857-42439a7714cc/go.mod h1:e5tpTHCfVze+7EpLEozzMB3eafxo2KT5veNg1k6byQU= -github.com/golangci/lint-1 v0.0.0-20190420132249-ee948d087217/go.mod h1:66R6K6P6VWk9I95jvqGxkqJxVWGFy9XlDwLwVz1RCFg= github.com/golangci/lint-1 v0.0.0-20191013205115-297bf364a8e0/go.mod h1:66R6K6P6VWk9I95jvqGxkqJxVWGFy9XlDwLwVz1RCFg= github.com/golangci/maligned v0.0.0-20180506175553-b1d89398deca/go.mod h1:tvlJhZqDe4LMs4ZHD0oMUlt9G2LWuDGoisJTBzLMV9o= github.com/golangci/misspell v0.0.0-20180809174111-950f5d19e770/go.mod h1:dEbvlSfYbMQDtrpRMQU675gSDLDNa8sCPPChZ7PhiVA= @@ -132,9 +119,8 @@ github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0/go.mod 
h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= -github.com/google/go-cmp v0.3.1 h1:Xye71clBPdm5HgqGwUkwhbynsUJZhDbS20FvLhQ2izg= -github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/gofuzz v0.0.0-20161122191042-44d81051d367/go.mod h1:HP5RmnzzSNb993RKQDq4+1A4ia9nllfqcQFTQJedwGI= github.com/google/gofuzz v1.0.0 h1:A8PeW59pxE9IoFRqBp37U+mSNaQoZ46F1f0f863XSXw= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -144,11 +130,9 @@ github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm4 github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= +github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d h1:7XGaL1e6bYS1yIonGp9761ExpPPV1ui0SAC59Yube9k= github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d/go.mod h1:sJBsCZ4ayReDTBIg8b9dl28c5xFWyhBTVRp3pOg5EKY= -github.com/googleapis/gnostic v0.3.0 h1:CcQijm0XKekKjP/YCz28LXVSpgguuB+nCxaSjCe09y0= -github.com/googleapis/gnostic v0.3.0/go.mod h1:sJBsCZ4ayReDTBIg8b9dl28c5xFWyhBTVRp3pOg5EKY= github.com/gophercloud/gophercloud v0.1.0/go.mod h1:vxM41WHh5uqHVBMZHzuwNOHh8XEoIEcSTewFxm1c5g8= -github.com/gophercloud/gophercloud v0.6.0/go.mod h1:GICNByuaEBibcjmjvI7QvYJSZEbGkcYwAR7EZK2WMqM= github.com/gorilla/context v0.0.0-20160226214623-1ea25387ff6f/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51q0aT7Yg= github.com/gorilla/mux v1.6.0/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/mux v1.7.3 h1:gnP5JzjVOuiZD07fKKToCAOjS0yOpj/qPETTXCCS6hw= @@ -166,7 +150,6 @@ 
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgf github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/golang-lru v0.5.3/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= @@ -200,9 +183,7 @@ github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czP github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= -github.com/matoous/godox v0.0.0-20190910121045-032ad8106c86/go.mod h1:1BELzlh859Sh1c6+90blK8lbYy0kwQf1bYlBhBysy1s= github.com/matoous/godox v0.0.0-20190911065817-5d6d842e92eb/go.mod h1:1BELzlh859Sh1c6+90blK8lbYy0kwQf1bYlBhBysy1s= -github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/goveralls v0.0.2/go.mod h1:8d1ZMHsd7fW6IRPKQh46F2WRpyib5/X4FOpevwGNQEw= @@ -217,7 +198,6 @@ github.com/modern-go/reflect2 v0.0.0-20180320133207-05fbef0ca5da/go.mod h1:bx2lN github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= 
github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= -github.com/monopole/mdrip v1.0.0/go.mod h1:N1/ppRG9CaPeUKAUHZ3dUlfOT81lTpKZLkyhCvTETwM= github.com/monopole/mdrip v1.0.1/go.mod h1:/7E04hlzRG9Jrp6WILZfYYm/REoJWL2l+MlsCO1eH74= github.com/mozilla/tls-observatory v0.0.0-20190404164649-a3c1b6cfecfd/go.mod h1:SrKMQvPiws7F7iqYp8/TX+IhxCYhzr6N/1yb8cwHsGk= github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= @@ -251,13 +231,10 @@ github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7z github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= github.com/quasilyte/go-consistent v0.0.0-20190521200055-c6f3937de18c/go.mod h1:5STLWrekHfjyYwxBRVRXNOSewLJ3PWfDJd1VyTS21fI= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= -github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYeAmZ5ZIpBWTGllZSQnw97Dj+woV0toclVaRGI8pc= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rs/cors v1.7.0 h1:+88SsELBHx5r+hZ8TCkggzSstaWNbDvThkVK8H6f9ik= github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= -github.com/russross/blackfriday v2.0.0+incompatible/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= -github.com/securego/gosec v0.0.0-20190912120752-140048b2a218/go.mod h1:q6oYAujd2qyeU4cJqIri4LBIgdHXGvxWHZ1E29HNFRE= github.com/securego/gosec v0.0.0-20191002120514-e680875ea14d/go.mod h1:w5+eXa0mYznDkHaMCXA4XYffjlH+cy1oyKbfzJXa2Do= github.com/shirou/gopsutil v0.0.0-20190901111213-e4ec7b275ada/go.mod h1:WWnYX4lzhCH5h/3YBfyVA3VbLYjlMZZAQcW9ojMexNc= github.com/shirou/w32 
v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc= @@ -288,13 +265,11 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/timakin/bodyclose v0.0.0-20190721030226-87058b9bfcec/go.mod h1:Qimiffbc6q9tBWlVV6x0P9sat/ao1xEkREYPPj9hphk= github.com/timakin/bodyclose v0.0.0-20190930140734-f7f2e9bca95e/go.mod h1:Qimiffbc6q9tBWlVV6x0P9sat/ao1xEkREYPPj9hphk= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= github.com/ultraware/funlen v0.0.2/go.mod h1:Dp4UiAus7Wdb9KUZsYWZEWiRzGuM2kXM1lPbfaF6xhA= -github.com/ultraware/whitespace v0.0.3/go.mod h1:aVMh/gQve5Maj9hQ/hg+F75lr/X5A89uZnzAmWSineA= github.com/ultraware/whitespace v0.0.4/go.mod h1:aVMh/gQve5Maj9hQ/hg+F75lr/X5A89uZnzAmWSineA= github.com/uudashr/gocognit v0.0.0-20190926065955-1655d0de0517/go.mod h1:j44Ayx2KW4+oB6SWMv8KsmHzZrOInQav7D3cQMJ5JUM= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= @@ -315,7 +290,6 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto 
v0.0.0-20190911031432-227b76d455e7/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190923035154-9ee001bba392/go.mod h1:/lpIB1dKB+9EgE3H3cr1v9wB50oz8l4C4h62xy7jSTY= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -338,9 +312,8 @@ golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20190909003024-a7b16738d86b h1:XfVGCX+0T4WOStkaOsJRllbsiImhB2jgVBGc9L0lPGc= -golang.org/x/net v0.0.0-20190909003024-a7b16738d86b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190923162816-aa69164e4478/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191004110552-13f9640d40b9 h1:rjwSpXsdiK0dV8/Naq3kAw9ymfAeJIyd0upUIElB+lI= golang.org/x/net v0.0.0-20191004110552-13f9640d40b9/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -364,8 +337,7 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190616124812-15dcb6c0061f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20190911201528-7ad0cfa0b7b5 h1:SW/0nsKCUaozCUtZTakri5laocGx/5bkDSSLrFUsa5s= -golang.org/x/sys v0.0.0-20190911201528-7ad0cfa0b7b5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190922100055-0a153f010e69 h1:rOhMmluY6kLMhdnrivzec6lLgaVbMHMn2ISQXJeJ5EM= golang.org/x/sys v0.0.0-20190922100055-0a153f010e69/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.0.0-20160726164857-2910a502d2bf/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -374,7 +346,6 @@ golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -393,8 +364,6 @@ golang.org/x/tools v0.0.0-20190614205625-5aca471b1d59/go.mod h1:/rFqwRUd4F7ZHNgw golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190719005602-e377ae9d6386/go.mod h1:jcCCGcm9btYwXyDqrUWc6MKQKKGJCWEQ3AfLSRIbEuI= golang.org/x/tools v0.0.0-20190910044552-dd2b5c81c578/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20190911230505-6bfd74cf029c/go.mod 
h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20190912215617-3720d1ec3678/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190930201159-7c411dea38b0/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191010075000-0337d82405ff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -429,8 +398,11 @@ gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +k8s.io/api v0.17.0 h1:H9d/lw+VkZKEVIUc8F3wgiQ+FUXTTr21M87jXLU7yqM= k8s.io/api v0.17.0/go.mod h1:npsyOePkeP0CPwyGfXDHxvypiYMJxBWAMpQxCaJ4ZxI= +k8s.io/apimachinery v0.17.0 h1:xRBnuie9rXcPxUkDizUsGvPf1cnlZCFu210op7J7LJo= k8s.io/apimachinery v0.17.0/go.mod h1:b9qmWdKlLuU9EBh+06BtLcSf/Mu89rWL33naRxs1uZg= +k8s.io/client-go v0.17.0 h1:8QOGvUGdqDMFrm9sD6IUFl256BcffynGoe80sxgTEDg= k8s.io/client-go v0.17.0/go.mod h1:TYgR6EUHs6k45hb6KWjVD6jFZvJV4gHDikv/It0xz+k= k8s.io/gengo v0.0.0-20190128074634-0689ccc1d7d6/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= k8s.io/klog v0.0.0-20181102134211-b9b56d5dfc92/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUcqjScdoY3a9IHpCEIOOfk= @@ -439,17 +411,12 @@ k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I= k8s.io/kube-openapi v0.0.0-20191107075043-30be4d16710a h1:UcxjrRMyNx/i/y8G7kPvLyy7rfbeuf1PYyBf973pgyU= k8s.io/kube-openapi v0.0.0-20191107075043-30be4d16710a/go.mod h1:1TqjTSzOxsLGIKfj0lK8EeCP7K1iUG65v09OM0/WG5E= -k8s.io/utils v0.0.0-20191030222137-2b95a09bc58d/go.mod 
h1:sZAwmy6armz5eXlNoLmJcl4F1QuKu7sr+mFQ0byX7Ew= k8s.io/utils v0.0.0-20191114184206-e782cd3c129f/go.mod h1:sZAwmy6armz5eXlNoLmJcl4F1QuKu7sr+mFQ0byX7Ew= mvdan.cc/interfacer v0.0.0-20180901003855-c20040233aed/go.mod h1:Xkxe497xwlCKkIaQYRfC7CSLworTXY9RMqwhhCm+8Nc= mvdan.cc/lint v0.0.0-20170908181259-adc824a0674b/go.mod h1:2odslEg/xrtNQqCYg2/jCoyKnw3vv5biOc3JnIcYfL4= mvdan.cc/unparam v0.0.0-20190720180237-d51796306d8f/go.mod h1:4G1h5nDURzA3bwVMZIVpwbkw+04kSxk3rAtzlimaUJw= -sigs.k8s.io/kustomize/api v0.2.0 h1:e++6JpysnnlUbHmFrv6jvfF5rFlgQ103bS1DO7r5bWA= -sigs.k8s.io/kustomize/api v0.2.0/go.mod h1:zVtMg179jW1gr74jo9fc2Ac9dLYLTZZThc3DDb9lDW4= -sigs.k8s.io/kustomize/api v0.3.0/go.mod h1:4jaPCtRzxfQLFdYq4gYo40dBGW1hyPp/f4AuiZB5dAQ= -sigs.k8s.io/kustomize/pluginator/v2 v2.0.0/go.mod h1:zrXhTv8BAKt0egmZX/8AtMOSFUSWM9YuoHvvqz8/eHE= -sigs.k8s.io/kustomize/pseudo/k8s v0.1.0 h1:otg4dLFc03c3gzl+2CV8GPGcd1kk8wjXwD+UhhcCn5I= -sigs.k8s.io/kustomize/pseudo/k8s v0.1.0/go.mod h1:bl/gVJgYYhJZCZdYU2BfnaKYAlqFkgbJEkpl302jEss= +sigs.k8s.io/kustomize/api v0.3.0 h1:riR/YsL75nGb+aIPFdIRiqu21+OZbAXQybDS7+FUYRg= +sigs.k8s.io/kustomize/api v0.3.0/go.mod h1:DWNMJBV1xvLruMpihGgnIPznMwHpwUSrxz6v3gnw5kw= sigs.k8s.io/structured-merge-diff v0.0.0-20190525122527-15d366b2352e/go.mod h1:wWxsB5ozmmv/SG7nM11ayaAW51xMvak/t1r0CSlcokI= sigs.k8s.io/yaml v1.1.0 h1:4A07+ZFc2wgJwo8YNlQpr1rVlgUDlxXHhPJciaPY5gs= sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= From 127541f61056f8b2b244c18e8cdf6a9ad2b13efa Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 17 Dec 2019 14:35:44 -0800 Subject: [PATCH 11/12] Support different modes of running the crawler --- api/internal/crawl/cmd/crawler/crawler.go | 32 ++++++++++++++- .../crawl/config/crawler/job/README.md | 41 +++++++++++++++++++ .../crawl/config/crawler/job/job.yaml | 2 +- 3 files changed, 72 insertions(+), 3 deletions(-) create mode 100644 api/internal/crawl/config/crawler/job/README.md diff --git
a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index ea73efd04..7c62522af 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -3,6 +3,7 @@ package main import ( "context" "fmt" + "log" "net/http" "os" "time" @@ -23,11 +24,31 @@ const ( retryCount = 3 githubUserEnv = "GITHUB_USER" githubRepoEnv = "GITHUB_REPO" + crawlIndexOnlyEnv = "CRAWL_INDEX_ONLY" + crawlGithubOnlyEnv = "CRAWL_GITHUB_ONLY" ) +// countEnvs count the environment variables whose values are not empty. +func countEnvs(envs ...string) int { + count := 0 + for _, env := range envs { + if env != "" { + count++ + } + } + return count +} + func main() { githubUser := os.Getenv(githubUserEnv) githubRepo := os.Getenv(githubRepoEnv) + crawlIndexOnly := os.Getenv(crawlIndexOnlyEnv) + crawlGithubOnly := os.Getenv(crawlGithubOnlyEnv) + + if countEnvs(githubUser, githubRepo, crawlIndexOnly, crawlGithubOnly) > 1 { + log.Fatalf("only one of [%s, %s, %s, %s] should be set", + githubUserEnv, githubRepoEnv, crawlIndexOnlyEnv, crawlGithubOnlyEnv) + } githubToken := os.Getenv(githubAccessTokenVar) if githubToken == "" { @@ -122,6 +143,13 @@ func main() { } crawlers := []crawler.Crawler{ghCrawler} - crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) - crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + + if crawlGithubOnly == "true" || githubRepo != "" || githubUser != "" { + crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + } else if crawlIndexOnly == "true" { + crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) + } else { + crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) + crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + } } diff --git a/api/internal/crawl/config/crawler/job/README.md b/api/internal/crawl/config/crawler/job/README.md new file mode 100644 index 000000000..3fc3e665b --- 
/dev/null +++ b/api/internal/crawl/config/crawler/job/README.md @@ -0,0 +1,41 @@ +There are five ways of running the crawler job. + +# Crawling all the documents in the index and crawling all the kustomization files on Github + +This is the default setting of the crawler job. + +# Crawling all the documents in the index + +Set the environment variable `CRAWL_INDEX_ONLY` to `true` like this: + +``` + - name: CRAWL_INDEX_ONLY + value: true +``` + +# Crawling all the kustomization files on Github + +Set the environment variable `CRAWL_GITHUB_ONLY` to `true` like this: + +``` + - name: CRAWL_GITHUB_ONLY + value: true +``` + +# Crawling all the kustomization files in a Github repo + +Add the environment variable `GITHUB_REPO` into the crawler container. For example: + +``` + - name: GITHUB_REPO + value: kubernetes-sigs/kustomize +``` + +# Crawling all the kustomization files in all the repositories of a Github user + +Add the environment variable `GITHUB_USER` into the crawler container. For example: + +``` + - name: GITHUB_USER + value: kubernetes-sigs +``` diff --git a/api/internal/crawl/config/crawler/job/job.yaml b/api/internal/crawl/config/crawler/job/job.yaml index dde0de398..6dd8d4c97 100644 --- a/api/internal/crawl/config/crawler/job/job.yaml +++ b/api/internal/crawl/config/crawler/job/job.yaml @@ -8,7 +8,7 @@ spec: restartPolicy: OnFailure containers: - name: crawler - image: gcr.io/kustomize-search/crawler:latest + image: gcr.io/haiyanmeng-gke-dev/crawler:v1 imagePullPolicy: Always env: - name: GITHUB_ACCESS_TOKEN From be2e03681d5151c39cb1764290cfd7a98ae69f9d Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Tue, 17 Dec 2019 14:56:52 -0800 Subject: [PATCH 12/12] Remove unused param from IndexFunc --- api/internal/crawl/cmd/crawler/crawler.go | 157 ++++++++++++------ .../crawl/config/crawler/job/README.md | 39 +++-- .../crawl/config/crawler/job/job.yaml | 2 + api/internal/crawl/crawler/crawler.go | 6 +- api/internal/crawl/crawler/crawler_test.go | 2 +- 5 files
changed, 135 insertions(+), 71 deletions(-) diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index 7c62522af..fdbafeafd 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -22,34 +22,53 @@ const ( redisCacheURL = "REDIS_CACHE_URL" redisKeyURL = "REDIS_KEY_URL" retryCount = 3 - githubUserEnv = "GITHUB_USER" - githubRepoEnv = "GITHUB_REPO" - crawlIndexOnlyEnv = "CRAWL_INDEX_ONLY" - crawlGithubOnlyEnv = "CRAWL_GITHUB_ONLY" ) -// countEnvs count the environment variables whose values are not empty. -func countEnvs(envs ...string) int { - count := 0 - for _, env := range envs { - if env != "" { - count++ - } +type CrawlMode int +const ( + CrawlUnknown CrawlMode = iota + // Crawl all the kustomization files in all the repositories of a Github user + CrawlUser + // Crawl all the kustomization files in a Github repo + CrawlRepo + // Crawl all the documents in the index + CrawlIndex + // Crawl all the kustomization files on Github + CrawlGithub + // Crawl all the documents in the index and crawling all the kustomization files on Github + CrawlIndexAndGithub +) + +func NewCrawlMode(s string) CrawlMode { + switch s { + case "github-user": + return CrawlUser + case "github-repo": + return CrawlRepo + case "": + return CrawlIndexAndGithub + case "index": + return CrawlIndex + case "github": + return CrawlGithub + default: + return CrawlUnknown } - return count +} + +func Usage() { + fmt.Printf("Usage: %s [mode] [githubUser|githubRepo]\n", os.Args[0]) + fmt.Printf("\tmode can be one of [github-user, github-repo, index, github]\n") + fmt.Printf("%s: crawl all the documents in the index and crawling all the kustomization files on Github\n", os.Args[0]) + fmt.Printf("%s index: crawl all the documents in the index\n", os.Args[0]) + fmt.Printf("%s gihub: crawl all the kustomization files on Github\n", os.Args[0]) + fmt.Printf("%s github-user : Crawl all the kustomization files in 
all the repositories of a Github user\n", os.Args[0]) + fmt.Printf("\tFor example, %s github-user kubernetes-sigs\n", os.Args[0]) + fmt.Printf("%s github-repo : Crawl all the kustomization files in a Github repo\n", os.Args[0]) + fmt.Printf("\tFor example, %s github-repo kubernetes-sigs/kustomize\n", os.Args[0]) } func main() { - githubUser := os.Getenv(githubUserEnv) - githubRepo := os.Getenv(githubRepoEnv) - crawlIndexOnly := os.Getenv(crawlIndexOnlyEnv) - crawlGithubOnly := os.Getenv(crawlGithubOnlyEnv) - - if countEnvs(githubUser, githubRepo, crawlIndexOnly, crawlGithubOnly) > 1 { - log.Fatalf("only one of [%s, %s, %s, %s] should be set", - githubUserEnv, githubRepoEnv, crawlIndexOnlyEnv, crawlGithubOnlyEnv) - } - githubToken := os.Getenv(githubAccessTokenVar) if githubToken == "" { fmt.Printf("Must set the variable '%s' to make github requests.\n", @@ -64,8 +83,6 @@ func main() { return } - seedDocs := make(crawler.CrawlSeed, 0) - cacheURL := os.Getenv(redisCacheURL) cache, err := redis.DialURL(cacheURL) clientCache := &http.Client{} @@ -86,7 +103,7 @@ func main() { } // Index updates the value in the index. - indexFunc := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler, mode index.Mode) error { + indexFunc := func(cdoc crawler.CrawledDocument, mode index.Mode) error { switch d := cdoc.(type) { case *doc.KustomizationDocument: switch mode { @@ -106,30 +123,41 @@ func main() { // This helps avoid indexing a given document multiple times. 
seen := make(map[string]struct{}) - var ghCrawler crawler.Crawler - - if githubRepo != "" { - ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, - github.QueryWith( - github.Filename("kustomization.yaml"), - github.Filename("kustomization.yml"), - github.Repo(githubRepo)), - ) - } else if githubUser != "" { - ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, - github.QueryWith( - github.Filename("kustomization.yaml"), - github.Filename("kustomization.yml"), - github.User(githubUser)), - ) + var mode CrawlMode + if len(os.Args) == 1 { + mode = CrawlIndexAndGithub } else { - ghCrawler = github.NewCrawler(githubToken, retryCount, clientCache, - github.QueryWith( - github.Filename("kustomization.yaml"), - github.Filename("kustomization.yml")), - ) + mode = NewCrawlMode(os.Args[1]) + } - // get all the documents in the index + ghCrawlerConstructor := func(user, repo string) crawler.Crawler { + if user != "" { + return github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml"), + github.User(user)), + ) + } else if repo != "" { + return github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml"), + github.Repo(repo)), + ) + } else { + return github.NewCrawler(githubToken, retryCount, clientCache, + github.QueryWith( + github.Filename("kustomization.yaml"), + github.Filename("kustomization.yml")), + ) + } + } + + seedDocs := make(crawler.CrawlSeed, 0) + + // get all the documents in the index + getSeedDocsFunc := func() { query := []byte(`{ "query":{ "match_all":{} } }`) it := idx.IterateQuery(query, 10000, 60*time.Second) for it.Next() { @@ -142,14 +170,35 @@ func main() { } } - crawlers := []crawler.Crawler{ghCrawler} - - if crawlGithubOnly == "true" || githubRepo != "" || githubUser != "" { - crawler.CrawlGithub(ctx, crawlers, docConverter, 
indexFunc, seen) - } else if crawlIndexOnly == "true" { - crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) - } else { + switch mode { + case CrawlIndexAndGithub: + getSeedDocsFunc() + crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")} crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + case CrawlIndex: + getSeedDocsFunc() + crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")} + crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) + case CrawlGithub: + crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")} + crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + case CrawlUser: + if len(os.Args) < 3 { + Usage() + log.Fatalf("Please specify a github user!") + } + crawlers := []crawler.Crawler{ghCrawlerConstructor(os.Args[2], "")} + crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + case CrawlRepo: + if len(os.Args) < 3 { + Usage() + log.Fatalf("Please specify a github repo!") + } + crawlers := []crawler.Crawler{ghCrawlerConstructor("", os.Args[2])} + crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) + case CrawlUnknown: + Usage() + log.Fatalf("The crawler mode must be one of [github-user, github-repo, index, github]") } } diff --git a/api/internal/crawl/config/crawler/job/README.md b/api/internal/crawl/config/crawler/job/README.md index 3fc3e665b..3570f27ee 100644 --- a/api/internal/crawl/config/crawler/job/README.md +++ b/api/internal/crawl/config/crawler/job/README.md @@ -2,40 +2,53 @@ There are three ways of running the crawler job. # Crawling all the documents in the index and crawling all the kustomization files on Github -This is the default setting of the crawler job. +This is the default setting of the crawler job. 
The `command` and `args` field +of the container should be: + +``` + command: ["/crawler"] + args: [] +``` + +Or + +``` + command: ["/crawler"] + args: [""] +``` # Crawling all the documents in the index -Set the environment variable `CRAWL_INDEX_ONLY` to `true` like this: +The `command` and `args` field of the container should be: ``` - - name: CRAWL_INDEX_ONLY - value: true + command: ["/crawler"] + args: ["index"] ``` # Crawling all the kustomization files on Github -Set the environment variable `CRAWL_GITHUB_ONLY` to `true` like this: +The `command` and `args` field of the container should be: ``` - - name: CRAWL_GITHUB_ONLY - value: true + command: ["/crawler"] + args: ["github"] ``` # Crawling all the kustomization files in a Github repo -Add the environment variable `GITHUB_REPO` into the crawler container. For example: +The `command` and `args` field of the container should be like: ``` - - name: GITHUB_REPO - value: kubernetes-sigs/kustomize + command: ["/crawler"] + args: ["github-repo", "kubernetes-sigs/kustomize"] ``` # Crawling all the kustomization files in all the repositories of a Github user -Add the environment variable `GITHUB_USER` into the crawler container. 
For example: +The `command` and `args` field of the container should be like: ``` - - name: GITHUB_USER - value: kubernetes-sigs + command: ["/crawler"] + args: ["github-user", "kubernetes-sigs"] ``` diff --git a/api/internal/crawl/config/crawler/job/job.yaml b/api/internal/crawl/config/crawler/job/job.yaml index 6dd8d4c97..28e36bcb8 100644 --- a/api/internal/crawl/config/crawler/job/job.yaml +++ b/api/internal/crawl/config/crawler/job/job.yaml @@ -10,6 +10,8 @@ spec: - name: crawler image: gcr.io/haiyanmeng-gke-dev/crawler:v1 imagePullPolicy: Always + command: ["/crawler"] + args: ["github-repo", "kubernetes-sigs/kustomize"] env: - name: GITHUB_ACCESS_TOKEN valueFrom: diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index d46cf161d..31cabc2b7 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -49,7 +49,7 @@ type CrawledDocument interface { type CrawlSeed []*doc.Document -type IndexFunc func(CrawledDocument, Crawler, index.Mode) error +type IndexFunc func(CrawledDocument, index.Mode) error type Converter func(*doc.Document) (CrawledDocument, error) func logIfErr(err error) { @@ -74,7 +74,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, seen[cdoc.ID()] = struct{}{} // Insert into index - if err := indx(cdoc, match, index.InsertOrUpdate); err != nil { + if err := indx(cdoc, index.InsertOrUpdate); err != nil { logger.Printf("Failed to insert or update %s %s: %v", cdoc.GetDocument().RepositoryURL, cdoc.GetDocument().FilePath, err) return @@ -142,7 +142,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C Document: *tail, } seen[cdoc.ID()] = struct{}{} - if err := indx(cdoc, match, index.Delete); err != nil { + if err := indx(cdoc, index.Delete); err != nil { logger.Printf("Failed to delete %s %s: %v", cdoc.RepositoryURL, cdoc.FilePath, err) } diff --git a/api/internal/crawl/crawler/crawler_test.go 
b/api/internal/crawl/crawler/crawler_test.go index 00a619c46..41a848612 100644 --- a/api/internal/crawl/crawler/crawler_test.go +++ b/api/internal/crawl/crawler/crawler_test.go @@ -318,7 +318,7 @@ resources: Document: *d, }, nil }, - func(d CrawledDocument, cr Crawler, mode index.Mode) error { + func(d CrawledDocument, mode index.Mode) error { visited[d.ID()]++ return nil },