diff --git a/api/internal/crawl/crawler/crawler.go b/api/internal/crawl/crawler/crawler.go index 410c3454b..30d95ec0f 100644 --- a/api/internal/crawl/crawler/crawler.go +++ b/api/internal/crawl/crawler/crawler.go @@ -93,7 +93,15 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc, seen map[string]struct{}, stack *CrawlSeed) { - docCount := 0 + + UpdatedDocCount := 0 + seenDocCount := 0 + cachedDocCount := 0 + findMatchErrCount := 0 + FetchDocumentErrCount := 0 + SetCreatedErrCount := 0 + convErrCount := 0 + // During the execution of the for loop, more Documents may be added into (*docsPtr). for len(*docsPtr) > 0 { // get the last Document in (*docPtr), which will be crawled in this iteration. @@ -103,18 +111,20 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C *docsPtr = (*docsPtr)[:(len(*docsPtr) - 1)] if _, ok := seen[tail.ID()]; ok { + seenDocCount++ continue } - docCount++ if tail.WasCached() { logger.Printf("%s %s is cached already", tail.RepositoryURL, tail.FilePath) + cachedDocCount++ continue } match := findMatch(tail, crawlers) if match == nil { logIfErr(fmt.Errorf("%v could not match any crawler", tail)) + findMatchErrCount++ continue } @@ -122,23 +132,37 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C if err := match.FetchDocument(ctx, tail); err != nil { logger.Printf("FetchDocument failed on %s %s: %v", tail.RepositoryURL, tail.FilePath, err) + FetchDocumentErrCount++ continue } if err := match.SetCreated(ctx, tail); err != nil { logger.Printf("SetCreated failed on %s %s: %v", tail.RepositoryURL, tail.FilePath, err) + SetCreatedErrCount++ continue } cdoc, err := conv(tail) // If conv returns an error, cdoc can still be added into the index so that // cdoc.Document can be searched. - logIfErr(err) + if err != nil { + logger.Printf("conv failed on %s %s: %v", + tail.RepositoryURL, tail.FilePath, err) + convErrCount++ + } + UpdatedDocCount++ addBranches(cdoc, match, indx, seen, stack) } - logger.Printf("%d documents were crawled by doCrawl\n", docCount) + logger.Printf("Summary of doCrawl:\n") + logger.Printf("\t%d documents were updated\n", UpdatedDocCount) + logger.Printf("\t%d documents were seen by the crawler already and skipped\n", seenDocCount) + logger.Printf("\t%d documents were cached already and skipped\n", cachedDocCount) + logger.Printf("\t%d documents didn't have a matching crawler and skipped\n", findMatchErrCount) + logger.Printf("\t%d documents cannot be fetched and skipped\n", FetchDocumentErrCount) + logger.Printf("\t%d documents cannot update its creation time and skipped\n", SetCreatedErrCount) + logger.Printf("\t%d documents cannot be converted and skipped\n", convErrCount) } // CrawlFromSeed updates all the documents in seed, and crawls all the new