Mulitple improvements of the crawler

1) Set document IDs to avoid duplicating documents;
2) Set the `creationTime` field of each document in the index;
3) set the `values`, `kinds` and `identifiers` fields for all documents;
4) Add a `Copy` method into the `Document` struct: this fixes the issue
where all the documents existing in the index point to the same Document
object;
5) Avoid using keystore redis;
6) Set imagePullPolicy to `Always` for crawler jobs.
This commit is contained in:
Haiyan Meng
2019-12-05 09:51:22 -08:00
parent 54b1549586
commit bffc0d7071
13 changed files with 125 additions and 36 deletions

View File

@@ -102,11 +102,9 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed,
}
doCrawl := func(docsPtr *CrawlSeed) {
for len(*docsPtr) > 0 {
back := len(*docsPtr) - 1
next := (*docsPtr)[back]
*docsPtr = (*docsPtr)[:back]
n := len(*docsPtr)
for i := 0; i < n; i++ {
next := (*docsPtr)[i]
match := findMatch(next)
if match == nil {
logIfErr(fmt.Errorf(
@@ -114,24 +112,28 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed,
continue
}
logger.Println("Crawling ", next.RepositoryURL, next.FilePath)
err := match.FetchDocument(ctx, next)
logIfErr(err)
// If there was no change or there is an error, we don't have
// to branch out, since the dependencies are already in the
// index, or we cannot find the document.
if err != nil || next.WasCached() {
if next.WasCached() {
logger.Println(next.RepositoryURL, next.FilePath, "is cached already")
}
continue
}
logIfErr(match.SetCreated(ctx, next))
cdoc, err := conv(next)
logIfErr(err)
if err != nil {
continue
}
addBranches(cdoc, match)
}
}
// Exploit seed to update bulk of corpus.
logger.Printf("updating %d documents from seed\n", len(seed))
doCrawl(&seed)