Add a summary for doCrawl

This commit is contained in:
Haiyan Meng
2019-12-16 12:13:27 -08:00
parent 8c89f0946c
commit 5598d35e4b

View File

@@ -93,7 +93,15 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc, func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc,
seen map[string]struct{}, stack *CrawlSeed) { seen map[string]struct{}, stack *CrawlSeed) {
docCount := 0
UpdatedDocCount := 0
seenDocCount := 0
cachedDocCount := 0
findMatchErrCount := 0
FetchDocumentErrCount := 0
SetCreatedErrCount := 0
convErrCount := 0
// During the execution of the for loop, more Documents may be added into (*docsPtr). // During the execution of the for loop, more Documents may be added into (*docsPtr).
for len(*docsPtr) > 0 { for len(*docsPtr) > 0 {
// get the last Document in (*docsPtr), which will be crawled in this iteration. // get the last Document in (*docsPtr), which will be crawled in this iteration.
@@ -103,18 +111,20 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
*docsPtr = (*docsPtr)[:(len(*docsPtr) - 1)] *docsPtr = (*docsPtr)[:(len(*docsPtr) - 1)]
if _, ok := seen[tail.ID()]; ok { if _, ok := seen[tail.ID()]; ok {
seenDocCount++
continue continue
} }
docCount++
if tail.WasCached() { if tail.WasCached() {
logger.Printf("%s %s is cached already", tail.RepositoryURL, tail.FilePath) logger.Printf("%s %s is cached already", tail.RepositoryURL, tail.FilePath)
cachedDocCount++
continue continue
} }
match := findMatch(tail, crawlers) match := findMatch(tail, crawlers)
if match == nil { if match == nil {
logIfErr(fmt.Errorf("%v could not match any crawler", tail)) logIfErr(fmt.Errorf("%v could not match any crawler", tail))
findMatchErrCount++
continue continue
} }
@@ -122,23 +132,37 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
if err := match.FetchDocument(ctx, tail); err != nil { if err := match.FetchDocument(ctx, tail); err != nil {
logger.Printf("FetchDocument failed on %s %s: %v", logger.Printf("FetchDocument failed on %s %s: %v",
tail.RepositoryURL, tail.FilePath, err) tail.RepositoryURL, tail.FilePath, err)
FetchDocumentErrCount++
continue continue
} }
if err := match.SetCreated(ctx, tail); err != nil { if err := match.SetCreated(ctx, tail); err != nil {
logger.Printf("SetCreated failed on %s %s: %v", logger.Printf("SetCreated failed on %s %s: %v",
tail.RepositoryURL, tail.FilePath, err) tail.RepositoryURL, tail.FilePath, err)
SetCreatedErrCount++
continue continue
} }
cdoc, err := conv(tail) cdoc, err := conv(tail)
// If conv returns an error, cdoc can still be added into the index so that // If conv returns an error, cdoc can still be added into the index so that
// cdoc.Document can be searched. // cdoc.Document can be searched.
logIfErr(err) if err != nil {
logger.Printf("conv failed on %s %s: %v",
tail.RepositoryURL, tail.FilePath, err)
convErrCount++
}
UpdatedDocCount++
addBranches(cdoc, match, indx, seen, stack) addBranches(cdoc, match, indx, seen, stack)
} }
logger.Printf("%d documents were crawled by doCrawl\n", docCount) logger.Printf("Summary of doCrawl:\n")
logger.Printf("\t%d documents were updated\n", UpdatedDocCount)
logger.Printf("\t%d documents were seen by the crawler already and skipped\n", seenDocCount)
logger.Printf("\t%d documents were cached already and skipped\n", cachedDocCount)
logger.Printf("\t%d documents didn't have a matching crawler and skipped\n", findMatchErrCount)
logger.Printf("\t%d documents cannot be fetched and skipped\n", FetchDocumentErrCount)
logger.Printf("\t%d documents cannot update its creation time and skipped\n", SetCreatedErrCount)
logger.Printf("\t%d documents cannot be converted and skipped\n", convErrCount)
} }
// CrawlFromSeed updates all the documents in seed, and crawls all the new // CrawlFromSeed updates all the documents in seed, and crawls all the new