Improve the efficiency of crawling GitHub by skipping the documents that are already in the index
This commit is contained in:
Haiyan Meng
2020-01-24 19:55:56 -08:00
parent b7b88cae76
commit 154208d331
2 changed files with 8 additions and 0 deletions

View File

@@ -187,6 +187,12 @@ func main() {
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
case CrawlGithub:
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
// Add all the documents already in the index to seen so that the crawl
// skips them; this greatly reduces the time overhead of CrawlGithub.
getSeedDocsFunc()
for _, d := range seedDocs {
seen[d.ID()] = d.FileType
}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlUser:
if *githubUserPtr == "" {