Improve the efficiency of crawling GitHub by skipping the documents already in the index
This commit is contained in:
Haiyan Meng
2020-01-24 19:55:56 -08:00
parent b7b88cae76
commit 154208d331
2 changed files with 8 additions and 0 deletions

View File

@@ -187,6 +187,12 @@ func main() {
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
case CrawlGithub:
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
// Add all the documents already in the index to seen so they are skipped.
// This greatly reduces the time overhead of CrawlGithub.
getSeedDocsFunc()
for _, d := range seedDocs {
seen[d.ID()] = d.FileType
}
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
case CrawlUser:
if *githubUserPtr == "" {

View File

@@ -82,6 +82,8 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
seen.Set(cdoc.ID(), cdoc.GetDocument().FileType)
match.SetDefaultBranch(cdoc.GetDocument())
// Insert into index
if err := indx(cdoc, index.InsertOrUpdate); err != nil {
logger.Printf("Failed to insert or update doc(%s): %v",