From 154208d331f9397b2c1b902761c565ea1f50e7da Mon Sep 17 00:00:00 2001 From: Haiyan Meng Date: Fri, 24 Jan 2020 19:55:56 -0800 Subject: [PATCH] Improve the efficiency of crawling github by skipping the documents already in the index --- api/internal/crawl/cmd/crawler/crawler.go | 6 ++++++ api/internal/crawl/crawler/crawler.go | 2 ++ 2 files changed, 8 insertions(+) diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index 54bf3f432..e534780d2 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -187,6 +187,12 @@ func main() { crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen) case CrawlGithub: crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")} + // add all the documents in the index into seen. + // this greatly reduces the time overhead of CrawlGithub. + getSeedDocsFunc() + for _, d := range seedDocs { + seen[d.ID()] = d.FileType + } crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen) case CrawlUser: if *githubUserPtr == "" { diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 9a6a401d6..7488fcf3d 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -82,6 +82,8 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, seen.Set(cdoc.ID(), cdoc.GetDocument().FileType) + match.SetDefaultBranch(cdoc.GetDocument()) + // Insert into index if err := indx(cdoc, index.InsertOrUpdate); err != nil { logger.Printf("Failed to insert or update doc(%s): %v",