From 2c2aa928cc17c9fa2b4a17dcc757b72840760683 Mon Sep 17 00:00:00 2001
From: Haiyan Meng
Date: Tue, 17 Dec 2019 09:32:11 -0800
Subject: [PATCH] Delete non-existing documents from the index

---
 api/internal/crawl/cmd/crawler/crawler.go    | 14 +++++++---
 api/internal/crawl/crawler/crawler.go        | 27 +++++++++++++++-----
 api/internal/crawl/crawler/crawler_test.go   |  3 ++-
 api/internal/crawl/crawler/github/crawler.go |  5 +---
 api/internal/crawl/index/kustomize.go        | 15 +++++++++++
 5 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go
index 7e14f7072..c4e913706 100644
--- a/api/internal/crawl/cmd/crawler/crawler.go
+++ b/api/internal/crawl/cmd/crawler/crawler.go
@@ -65,12 +65,18 @@ func main() {
 	}
 
 	// Index updates the value in the index.
-	index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
+	index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler, mode index.Mode) error {
 		switch d := cdoc.(type) {
 		case *doc.KustomizationDocument:
-			fmt.Println("Inserting: ", d)
-			_, err := idx.Put(d.ID(), d)
-			return err
+			switch mode {
+			case index.Delete:
+				fmt.Println("Deleting: ", d)
+				return idx.Delete(d.ID())
+			default:
+				fmt.Println("Inserting: ", d)
+				_, err := idx.Put(d.ID(), d)
+				return err
+			}
 		default:
 			return fmt.Errorf("type %T not supported", d)
 		}
diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go
index 30d95ec0f..49c2403d1 100644
--- a/api/internal/crawl/crawler/crawler.go
+++ b/api/internal/crawl/crawler/crawler.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"log"
 	"os"
+	"sigs.k8s.io/kustomize/api/internal/crawl/index"
 	"sync"
 
 	_ "github.com/gomodule/redigo/redis"
@@ -47,7 +48,7 @@ type CrawledDocument interface {
 
 type CrawlSeed []*doc.Document
 
-type IndexFunc func(CrawledDocument, Crawler) error
+type IndexFunc func(CrawledDocument, Crawler, index.Mode) error
 type Converter func(*doc.Document) (CrawledDocument, error)
 
 func logIfErr(err error) {
@@ -72,8 +73,9 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
 	seen[cdoc.ID()] = struct{}{}
 
 	// Insert into index
-	if err := indx(cdoc, match); err != nil {
-		logger.Println("Failed to index: ", err)
+	if err := indx(cdoc, match, index.InsertOrUpdate); err != nil {
+		logger.Printf("Failed to insert or update %s %s: %v",
+			cdoc.GetDocument().RepositoryURL, cdoc.GetDocument().FilePath, err)
 		return
 	}
 
@@ -101,6 +103,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
 	FetchDocumentErrCount := 0
 	SetCreatedErrCount := 0
 	convErrCount := 0
+	deleteDocCount := 0
 
 	// During the execution of the for loop, more Documents may be added into (*docsPtr).
 	for len(*docsPtr) > 0 {
@@ -133,6 +136,18 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
 			logger.Printf("FetchDocument failed on %s %s: %v",
 				tail.RepositoryURL, tail.FilePath, err)
 			FetchDocumentErrCount++
+			// delete the document from the index
+			cdoc := &doc.KustomizationDocument{
+				Document: *tail,
+			}
+			seen[cdoc.ID()] = struct{}{}
+			if err := indx(cdoc, match, index.Delete); err != nil {
+				logger.Printf("Failed to delete %s %s: %v",
+					cdoc.RepositoryURL, cdoc.FilePath, err)
+			} else {
+				// Count only deletions that actually succeeded.
+				deleteDocCount++
+			}
 			continue
 		}
 
@@ -140,7 +155,6 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
 			logger.Printf("SetCreated failed on %s %s: %v",
 				tail.RepositoryURL, tail.FilePath, err)
 			SetCreatedErrCount++
-			continue
 		}
 
 		cdoc, err := conv(tail)
@@ -160,9 +174,10 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
 	logger.Printf("\t%d documents were seen by the crawler already and skipped\n", seenDocCount)
 	logger.Printf("\t%d documents were cached already and skipped\n", cachedDocCount)
 	logger.Printf("\t%d documents didn't have a matching crawler and skipped\n", findMatchErrCount)
-	logger.Printf("\t%d documents cannot be fetched and skipped\n", FetchDocumentErrCount)
-	logger.Printf("\t%d documents cannot update its creation time and skipped\n", SetCreatedErrCount)
+	logger.Printf("\t%d documents cannot be fetched, %d out of them are deleted\n",
+		FetchDocumentErrCount, deleteDocCount)
+	logger.Printf("\t%d documents cannot update its creation time but still were inserted or updated in the index\n", SetCreatedErrCount)
 	logger.Printf("\t%d documents cannot be converted and skipped\n", convErrCount)
 }
 
 // CrawlFromSeed updates all the documents in seed, and crawls all the new
diff --git a/api/internal/crawl/crawler/crawler_test.go b/api/internal/crawl/crawler/crawler_test.go
index 64887b1fc..ec479facf 100644
--- a/api/internal/crawl/crawler/crawler_test.go
+++ b/api/internal/crawl/crawler/crawler_test.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"reflect"
+	"sigs.k8s.io/kustomize/api/internal/crawl/index"
 	"sort"
 	"strings"
 	"sync"
@@ -316,7 +317,7 @@ resources:
 				Document: *d,
 			}, nil
 		},
-		func(d CrawledDocument, cr Crawler) error {
+		func(d CrawledDocument, cr Crawler, mode index.Mode) error {
 			visited[d.ID()]++
 			return nil
 		},
diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go
index 83a118553..fec1628b0 100644
--- a/api/internal/crawl/crawler/github/crawler.go
+++ b/api/internal/crawl/crawler/github/crawler.go
@@ -580,10 +580,7 @@ func (gcl GhClient) getWithRetry(
 
 	retryCount := gcl.retryCount
 
-	for err == nil &&
-		resp.StatusCode == http.StatusForbidden &&
-		retryCount > 0 {
-
+	for resp.StatusCode == http.StatusForbidden && retryCount > 0 {
 		retryTime := resp.Header.Get("Retry-After")
 		i, errAtoi := strconv.Atoi(retryTime)
 		if errAtoi != nil {
diff --git a/api/internal/crawl/index/kustomize.go b/api/internal/crawl/index/kustomize.go
index 79833d3fd..430142fc3 100644
--- a/api/internal/crawl/index/kustomize.go
+++ b/api/internal/crawl/index/kustomize.go
@@ -16,6 +16,16 @@ const (
 	AggregationKeyword = "aggs"
 )
 
+// Mode tells an indexing callback which operation to apply to a document.
+type Mode int
+
+const (
+	// InsertOrUpdate writes the document into the index.
+	InsertOrUpdate Mode = iota
+	// Delete removes the document from the index.
+	Delete
+)
+
 // Redefinition of Hits structure. Must match the json string of
 // KustomizeResult.Hits.Hits. Declared as a convenience for iteration.
 type KustomizeHits []struct {
@@ -301,6 +311,11 @@ func (ki *KustomizeIndex) Put(id string, doc *doc.KustomizationDocument) (string
 	return id, nil
 }
 
+// Delete a document with a given id from the kustomize index.
+func (ki *KustomizeIndex) Delete(id string) error {
+	return ki.index.Delete(id)
+}
+
 // Kustomize search options: What metrics should be returned? Kind Aggregation,
 // TimeseriesAggregation, etc. Also embedds the SearchOptions field to specify
 // the position in the sorted list of results and the number of results to return.