mirror of
https://github.com/kubernetes-sigs/kustomize.git
synced 2026-06-11 17:12:51 +00:00
Optimize memory usage by avoiding accumulating all the referred
documents into a single stack.
This commit is contained in:
@@ -159,40 +159,31 @@ func main() {
|
||||
}
|
||||
}
|
||||
|
||||
seedDocs := make(crawler.CrawlSeed, 0)
|
||||
|
||||
// get all the documents in the index
|
||||
getSeedDocsFunc := func() {
|
||||
query := []byte(`{ "query":{ "match_all":{} } }`)
|
||||
it := idx.IterateQuery(query, 10000, 60*time.Second)
|
||||
for it.Next() {
|
||||
for _, hit := range it.Value().Hits.Hits {
|
||||
seedDocs = append(seedDocs, hit.Document.Document.Copy())
|
||||
}
|
||||
}
|
||||
if err := it.Err(); err != nil {
|
||||
log.Fatalf("getSeedDocsFunc Error iterating: %v\n", err)
|
||||
}
|
||||
}
|
||||
|
||||
switch mode {
|
||||
case CrawlIndexAndGithub:
|
||||
getSeedDocsFunc()
|
||||
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
|
||||
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
|
||||
crawler.CrawlFromSeedIterator(ctx, it, crawlers, docConverter, indexFunc, seen)
|
||||
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
|
||||
case CrawlIndex:
|
||||
getSeedDocsFunc()
|
||||
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
|
||||
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
|
||||
crawler.CrawlFromSeedIterator(ctx, it, crawlers, docConverter, indexFunc, seen)
|
||||
case CrawlGithub:
|
||||
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
|
||||
// add all the documents in the index into seen.
|
||||
// this greatly reduces the time overhead of CrawlGithub.
|
||||
getSeedDocsFunc()
|
||||
for _, d := range seedDocs {
|
||||
for it.Next() {
|
||||
for _, hit := range it.Value().Hits.Hits {
|
||||
d := hit.Document.Document
|
||||
seen.Set(d.ID(), d.FileType)
|
||||
}
|
||||
}
|
||||
if err := it.Err(); err != nil {
|
||||
log.Fatalf("Error iterating the index: %v\n", err)
|
||||
}
|
||||
|
||||
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
|
||||
case CrawlUser:
|
||||
if *githubUserPtr == "" {
|
||||
|
||||
@@ -213,21 +213,36 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
|
||||
logger.Printf("\t%d documents cannot be converted but still were inserted or updated in the index\n", convErrCount)
|
||||
}
|
||||
|
||||
// CrawlFromSeedIterator iterates all the documents in the index and call CrawlFromSeed for each document.
|
||||
func CrawlFromSeedIterator(ctx context.Context, it *index.KustomizeIterator, crawlers []Crawler,
|
||||
conv Converter, indx IndexFunc, seen utils.SeenMap) {
|
||||
docCount := 0
|
||||
for it.Next() {
|
||||
for _, hit := range it.Value().Hits.Hits {
|
||||
docCount++
|
||||
logger.Printf("updating document %d from seed\n", docCount)
|
||||
|
||||
singleSeed := CrawlSeed{&(hit.Document.Document)}
|
||||
CrawlFromSeed(ctx, singleSeed, crawlers, conv, indx, seen)
|
||||
}
|
||||
}
|
||||
if err := it.Err(); err != nil {
|
||||
log.Fatalf("Error iterating the index: %v\n", err)
|
||||
}
|
||||
}
|
||||
|
||||
// CrawlFromSeed updates all the documents in seed, and crawls all the new
|
||||
// documents referred in the seed.
|
||||
func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler,
|
||||
conv Converter, indx IndexFunc, seen utils.SeenMap) {
|
||||
|
||||
// stack tracks the documents directly referred in other documents.
|
||||
// stack tracks the documents directly referred in the seed.
|
||||
stack := make(CrawlSeed, 0)
|
||||
|
||||
// Exploit seed to update bulk of corpus.
|
||||
logger.Printf("updating %d documents from seed\n", len(seed))
|
||||
// each unique document in seed will be crawled once.
|
||||
doCrawl(ctx, &seed, crawlers, conv, indx, seen, &stack, true, false)
|
||||
|
||||
// Traverse any new documents added while updating corpus.
|
||||
logger.Printf("crawling %d new documents found in the seed\n", len(stack))
|
||||
logger.Printf("crawling %d new documents referred by doc\n", len(stack))
|
||||
// While crawling each document in stack, the documents directly referred in the document
|
||||
// will be added into stack.
|
||||
// After this statement is done, stack will become empty.
|
||||
@@ -297,8 +312,6 @@ func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument,
|
||||
// CrawlGithub crawls all the kustomization files on Github.
|
||||
func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
||||
indx IndexFunc, seen utils.SeenMap) {
|
||||
// stack tracks the documents directly referred in other documents.
|
||||
stack := make(CrawlSeed, 0)
|
||||
|
||||
// ch is channel where all the crawlers sends the crawled documents to.
|
||||
ch := make(chan CrawledDocument, 1<<10)
|
||||
@@ -324,7 +337,20 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
||||
"%v could not match any crawler", cdoc))
|
||||
continue
|
||||
}
|
||||
|
||||
// stack tracks the documents directly referred in the document.
|
||||
stack := make(CrawlSeed, 0)
|
||||
|
||||
addBranches(cdoc, match, indx, seen, &stack)
|
||||
|
||||
if len(stack) > 0 {
|
||||
// here the documents referred in a kustomization file are crawled separately,
|
||||
// to avoid accumulating all the referred documents into a single gigantic
|
||||
// mem-inentive stack.
|
||||
logger.Printf("crawling the %d new documents referred in doc %d",
|
||||
len(stack), docCount)
|
||||
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack, false, true)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -336,9 +362,4 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
||||
}
|
||||
close(ch)
|
||||
wg.Wait()
|
||||
|
||||
// Handle deps of newly discovered documents.
|
||||
logger.Printf("crawling the %d new documents referred by other documents",
|
||||
len(stack))
|
||||
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack, false, true)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user