Delete non-existing documents from the index

This commit is contained in:
Haiyan Meng
2019-12-17 09:32:11 -08:00
parent 1eb713157c
commit 2c2aa928cc
5 changed files with 44 additions and 16 deletions

View File

@@ -65,12 +65,18 @@ func main() {
}
// Index updates the value in the index.
index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler, mode index.Mode) error {
switch d := cdoc.(type) {
case *doc.KustomizationDocument:
fmt.Println("Inserting: ", d)
_, err := idx.Put(d.ID(), d)
return err
switch mode {
case index.Delete:
fmt.Println("Deleting: ", d)
return idx.Delete(d.ID())
default:
fmt.Println("Inserting: ", d)
_, err := idx.Put(d.ID(), d)
return err
}
default:
return fmt.Errorf("type %T not supported", d)
}

View File

@@ -8,6 +8,7 @@ import (
"fmt"
"log"
"os"
"sigs.k8s.io/kustomize/api/internal/crawl/index"
"sync"
_ "github.com/gomodule/redigo/redis"
@@ -47,7 +48,7 @@ type CrawledDocument interface {
type CrawlSeed []*doc.Document
type IndexFunc func(CrawledDocument, Crawler) error
type IndexFunc func(CrawledDocument, Crawler, index.Mode) error
type Converter func(*doc.Document) (CrawledDocument, error)
func logIfErr(err error) {
@@ -72,8 +73,9 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
seen[cdoc.ID()] = struct{}{}
// Insert into index
if err := indx(cdoc, match); err != nil {
logger.Println("Failed to index: ", err)
if err := indx(cdoc, match, index.InsertOrUpdate); err != nil {
logger.Printf("Failed to insert or update %s %s: %v",
cdoc.GetDocument().RepositoryURL, cdoc.GetDocument().FilePath, err)
return
}
@@ -101,6 +103,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
FetchDocumentErrCount := 0
SetCreatedErrCount := 0
convErrCount := 0
deleteDocCount := 0
// During the execution of the for loop, more Documents may be added into (*docsPtr).
for len(*docsPtr) > 0 {
@@ -133,6 +136,16 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
logger.Printf("FetchDocument failed on %s %s: %v",
tail.RepositoryURL, tail.FilePath, err)
FetchDocumentErrCount++
// delete the document from the index
cdoc := &doc.KustomizationDocument{
Document: *tail,
}
seen[cdoc.ID()] = struct{}{}
if err := indx(cdoc, match, index.Delete); err != nil {
logger.Printf("Failed to delete %s %s: %v",
cdoc.RepositoryURL, cdoc.FilePath, err)
}
deleteDocCount++
continue
}
@@ -140,7 +153,6 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
logger.Printf("SetCreated failed on %s %s: %v",
tail.RepositoryURL, tail.FilePath, err)
SetCreatedErrCount++
continue
}
cdoc, err := conv(tail)
@@ -160,9 +172,10 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
logger.Printf("\t%d documents were seen by the crawler already and skipped\n", seenDocCount)
logger.Printf("\t%d documents were cached already and skipped\n", cachedDocCount)
logger.Printf("\t%d documents didn't have a matching crawler and skipped\n", findMatchErrCount)
logger.Printf("\t%d documents cannot be fetched and skipped\n", FetchDocumentErrCount)
logger.Printf("\t%d documents cannot update its creation time and skipped\n", SetCreatedErrCount)
logger.Printf("\t%d documents cannot be converted and skipped\n", convErrCount)
logger.Printf("\t%d documents cannot be fetched, %d out of them are deleted\n",
FetchDocumentErrCount, deleteDocCount)
logger.Printf("\t%d documents cannot update its creation time but still were inserted or updated in the index\n", SetCreatedErrCount)
logger.Printf("\t%d documents cannot be converted but still were inserted or updated in the index\n", convErrCount)
}
// CrawlFromSeed updates all the documents in seed, and crawls all the new

View File

@@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"reflect"
"sigs.k8s.io/kustomize/api/internal/crawl/index"
"sort"
"strings"
"sync"
@@ -316,7 +317,7 @@ resources:
Document: *d,
}, nil
},
func(d CrawledDocument, cr Crawler) error {
func(d CrawledDocument, cr Crawler, mode index.Mode) error {
visited[d.ID()]++
return nil
},

View File

@@ -580,10 +580,7 @@ func (gcl GhClient) getWithRetry(
retryCount := gcl.retryCount
for err == nil &&
resp.StatusCode == http.StatusForbidden &&
retryCount > 0 {
for resp.StatusCode == http.StatusForbidden && retryCount > 0 {
retryTime := resp.Header.Get("Retry-After")
i, errAtoi := strconv.Atoi(retryTime)
if errAtoi != nil {

View File

@@ -16,6 +16,12 @@ const (
AggregationKeyword = "aggs"
)
// Mode selects which operation an indexing callback applies to a document.
type Mode int

const (
	// InsertOrUpdate is the zero value of Mode; it requests that the document
	// be inserted into, or updated in, the index.
	InsertOrUpdate Mode = iota
	// Delete requests that the document be removed from the index.
	Delete
)
// Redefinition of Hits structure. Must match the json string of
// KustomizeResult.Hits.Hits. Declared as a convenience for iteration.
type KustomizeHits []struct {
@@ -301,6 +307,11 @@ func (ki *KustomizeIndex) Put(id string, doc *doc.KustomizationDocument) (string
return id, nil
}
// Delete removes the document identified by id from the kustomize index by
// delegating to the underlying index, and returns the error that call reports.
func (ki *KustomizeIndex) Delete(id string) error {
	err := ki.index.Delete(id)
	return err
}
// Kustomize search options: What metrics should be returned? Kind Aggregation,
// TimeseriesAggregation, etc. Also embeds the SearchOptions field to specify
// the position in the sorted list of results and the number of results to return.