Multiple improvements to the crawler

1) Set document IDs to avoid duplicating documents; 2) Set the `creationTime` field of each document in the index; 3) Set the `values`, `kinds` and `identifiers` fields for all documents; 4) Add a `Copy` method to the `Document` struct: this fixes the issue where all the documents in the index point to the same Document object; 5) Avoid using keystore redis; 6) Set imagePullPolicy to `Always` for crawler jobs.
2026-06-11 17:12:51 +00:00 · 2019-12-05 09:51:22 -08:00
parent 54b1549586
commit bffc0d7071
13 changed files with 125 additions and 36 deletions
--- a/api/internal/crawl/crawler/crawler.go
+++ b/api/internal/crawl/crawler/crawler.go
@@ -102,11 +102,9 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed,
 	}

 	doCrawl := func(docsPtr *CrawlSeed) {
-		for len(*docsPtr) > 0 {
-			back := len(*docsPtr) - 1
-			next := (*docsPtr)[back]
-			*docsPtr = (*docsPtr)[:back]
-
+		n := len(*docsPtr)
+		for i := 0; i < n; i++ {
+			next := (*docsPtr)[i]
 			match := findMatch(next)
 			if match == nil {
 				logIfErr(fmt.Errorf(
@@ -114,24 +112,28 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed,
 				continue
 			}

+			logger.Println("Crawling ", next.RepositoryURL, next.FilePath)
 			err := match.FetchDocument(ctx, next)
 			logIfErr(err)
 			// If there was no change or there is an error, we don't have
 			// to branch out, since the dependencies are already in the
 			// index, or we cannot find the document.
 			if err != nil || next.WasCached() {
+				if next.WasCached() {
+					logger.Println(next.RepositoryURL, next.FilePath, "is cached already")
+				}
 				continue
 			}

+			logIfErr(match.SetCreated(ctx, next))
+
 			cdoc, err := conv(next)
 			logIfErr(err)
-			if err != nil {
-				continue
-			}

 			addBranches(cdoc, match)
 		}
 	}
+
 	// Exploit seed to update bulk of corpus.
 	logger.Printf("updating %d documents from seed\n", len(seed))
 	doCrawl(&seed)