Improve the efficiency of crawling GitHub by skipping documents that are already in the index
2026-06-10 08:20:59 +00:00 · 2020-01-24 19:55:56 -08:00
parent b7b88cae76
commit 154208d331
2 changed files with 8 additions and 0 deletions
--- a/api/internal/crawl/cmd/crawler/crawler.go
+++ b/api/internal/crawl/cmd/crawler/crawler.go
@@ -187,6 +187,12 @@ func main() {
 		crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
 	case CrawlGithub:
 		crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
+		// add all the documents in the index into seen.
+		// this greatly reduces the time overhead of CrawlGithub.
+		getSeedDocsFunc()
+		for _, d := range seedDocs {
+			seen[d.ID()] = d.FileType
+		}
 		crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
 	case CrawlUser:
 		if *githubUserPtr == "" {
--- a/api/internal/crawl/crawler/crawler.go
+++ b/api/internal/crawl/crawler/crawler.go
@@ -82,6 +82,8 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,

 	seen.Set(cdoc.ID(), cdoc.GetDocument().FileType)

+	match.SetDefaultBranch(cdoc.GetDocument())
+
 	// Insert into index
 	if err := indx(cdoc, index.InsertOrUpdate); err != nil {
 		logger.Printf("Failed to insert or update doc(%s): %v",