Improve the efficiency of crawling GitHub by skipping the documents already in the index
This commit is contained in:
Haiyan Meng
2020-01-24 19:55:56 -08:00
parent b7b88cae76
commit 154208d331
2 changed files with 8 additions and 0 deletions

View File

@@ -82,6 +82,8 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
seen.Set(cdoc.ID(), cdoc.GetDocument().FileType)
match.SetDefaultBranch(cdoc.GetDocument())
// Insert into index
if err := indx(cdoc, index.InsertOrUpdate); err != nil {
logger.Printf("Failed to insert or update doc(%s): %v",