Multiple improvements to the crawler

1) Set document IDs to avoid duplicating documents; 2) Set the `creationTime` field of each document in the index; 3) Set the `values`, `kinds` and `identifiers` fields for all documents; 4) Add a `Copy` method to the `Document` struct: this fixes the issue where all the documents existing in the index point to the same Document object; 5) Avoid using keystore redis; 6) Set imagePullPolicy to `Always` for crawler jobs.
2026-06-14 10:30:59 +00:00 · 2019-12-05 09:51:22 -08:00
parent 54b1549586
commit bffc0d7071
13 changed files with 125 additions and 36 deletions
--- a/api/internal/crawl/doc/doc.go
+++ b/api/internal/crawl/doc/doc.go
@@ -44,9 +44,9 @@ type KustomizationDocument struct {
 type set map[string]struct{}

 func (doc *KustomizationDocument) String() string {
-	return fmt.Sprintf("%s %s %s %v %v %v %v %v", doc.RepositoryURL, doc.FilePath,
-		doc.DefaultBranch, doc.CreationTime, doc.IsSame,
-		doc.Kinds, doc.Identifiers, doc.Values)
+	return fmt.Sprintf("%s %s %s %v %v %v len(identifiers):%v  len(values):%v",
+		doc.RepositoryURL, doc.FilePath, doc.DefaultBranch, doc.CreationTime,
+		doc.IsSame,	doc.Kinds, len(doc.Identifiers), len(doc.Values))
 }

 // Implements the CrawlerDocument interface.