mirror of
https://github.com/kubernetes-sigs/kustomize.git
synced 2026-06-12 01:14:22 +00:00
Optimize memory usage by avoiding accumulating all the referred
documents into a single stack.
This commit is contained in:
@@ -159,40 +159,31 @@ func main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
seedDocs := make(crawler.CrawlSeed, 0)
|
query := []byte(`{ "query":{ "match_all":{} } }`)
|
||||||
|
it := idx.IterateQuery(query, 10000, 60*time.Second)
|
||||||
// get all the documents in the index
|
|
||||||
getSeedDocsFunc := func() {
|
|
||||||
query := []byte(`{ "query":{ "match_all":{} } }`)
|
|
||||||
it := idx.IterateQuery(query, 10000, 60*time.Second)
|
|
||||||
for it.Next() {
|
|
||||||
for _, hit := range it.Value().Hits.Hits {
|
|
||||||
seedDocs = append(seedDocs, hit.Document.Document.Copy())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if err := it.Err(); err != nil {
|
|
||||||
log.Fatalf("getSeedDocsFunc Error iterating: %v\n", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
switch mode {
|
switch mode {
|
||||||
case CrawlIndexAndGithub:
|
case CrawlIndexAndGithub:
|
||||||
getSeedDocsFunc()
|
|
||||||
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
|
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
|
||||||
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
|
crawler.CrawlFromSeedIterator(ctx, it, crawlers, docConverter, indexFunc, seen)
|
||||||
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
|
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
|
||||||
case CrawlIndex:
|
case CrawlIndex:
|
||||||
getSeedDocsFunc()
|
|
||||||
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
|
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
|
||||||
crawler.CrawlFromSeed(ctx, seedDocs, crawlers, docConverter, indexFunc, seen)
|
crawler.CrawlFromSeedIterator(ctx, it, crawlers, docConverter, indexFunc, seen)
|
||||||
case CrawlGithub:
|
case CrawlGithub:
|
||||||
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
|
crawlers := []crawler.Crawler{ghCrawlerConstructor("", "")}
|
||||||
// add all the documents in the index into seen.
|
// add all the documents in the index into seen.
|
||||||
// this greatly reduces the time overhead of CrawlGithub.
|
// this greatly reduces the time overhead of CrawlGithub.
|
||||||
getSeedDocsFunc()
|
for it.Next() {
|
||||||
for _, d := range seedDocs {
|
for _, hit := range it.Value().Hits.Hits {
|
||||||
seen.Set(d.ID(), d.FileType)
|
d := hit.Document.Document
|
||||||
|
seen.Set(d.ID(), d.FileType)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
if err := it.Err(); err != nil {
|
||||||
|
log.Fatalf("Error iterating the index: %v\n", err)
|
||||||
|
}
|
||||||
|
|
||||||
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
|
crawler.CrawlGithub(ctx, crawlers, docConverter, indexFunc, seen)
|
||||||
case CrawlUser:
|
case CrawlUser:
|
||||||
if *githubUserPtr == "" {
|
if *githubUserPtr == "" {
|
||||||
|
|||||||
@@ -213,21 +213,36 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
|
|||||||
logger.Printf("\t%d documents cannot be converted but still were inserted or updated in the index\n", convErrCount)
|
logger.Printf("\t%d documents cannot be converted but still were inserted or updated in the index\n", convErrCount)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CrawlFromSeedIterator iterates over all the documents in the index and calls CrawlFromSeed for each document.
|
||||||
|
func CrawlFromSeedIterator(ctx context.Context, it *index.KustomizeIterator, crawlers []Crawler,
|
||||||
|
conv Converter, indx IndexFunc, seen utils.SeenMap) {
|
||||||
|
docCount := 0
|
||||||
|
for it.Next() {
|
||||||
|
for _, hit := range it.Value().Hits.Hits {
|
||||||
|
docCount++
|
||||||
|
logger.Printf("updating document %d from seed\n", docCount)
|
||||||
|
|
||||||
|
singleSeed := CrawlSeed{&(hit.Document.Document)}
|
||||||
|
CrawlFromSeed(ctx, singleSeed, crawlers, conv, indx, seen)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := it.Err(); err != nil {
|
||||||
|
log.Fatalf("Error iterating the index: %v\n", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// CrawlFromSeed updates all the documents in seed, and crawls all the new
|
// CrawlFromSeed updates all the documents in seed, and crawls all the new
|
||||||
// documents referred in the seed.
|
// documents referred in the seed.
|
||||||
func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler,
|
func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler,
|
||||||
conv Converter, indx IndexFunc, seen utils.SeenMap) {
|
conv Converter, indx IndexFunc, seen utils.SeenMap) {
|
||||||
|
|
||||||
// stack tracks the documents directly referred in other documents.
|
// stack tracks the documents directly referred in the seed.
|
||||||
stack := make(CrawlSeed, 0)
|
stack := make(CrawlSeed, 0)
|
||||||
|
|
||||||
// Exploit seed to update bulk of corpus.
|
|
||||||
logger.Printf("updating %d documents from seed\n", len(seed))
|
|
||||||
// each unique document in seed will be crawled once.
|
// each unique document in seed will be crawled once.
|
||||||
doCrawl(ctx, &seed, crawlers, conv, indx, seen, &stack, true, false)
|
doCrawl(ctx, &seed, crawlers, conv, indx, seen, &stack, true, false)
|
||||||
|
|
||||||
// Traverse any new documents added while updating corpus.
|
logger.Printf("crawling %d new documents referred by doc\n", len(stack))
|
||||||
logger.Printf("crawling %d new documents found in the seed\n", len(stack))
|
|
||||||
// While crawling each document in stack, the documents directly referred in the document
|
// While crawling each document in stack, the documents directly referred in the document
|
||||||
// will be added into stack.
|
// will be added into stack.
|
||||||
// After this statement is done, stack will become empty.
|
// After this statement is done, stack will become empty.
|
||||||
@@ -297,8 +312,6 @@ func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument,
|
|||||||
// CrawlGithub crawls all the kustomization files on Github.
|
// CrawlGithub crawls all the kustomization files on Github.
|
||||||
func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
||||||
indx IndexFunc, seen utils.SeenMap) {
|
indx IndexFunc, seen utils.SeenMap) {
|
||||||
// stack tracks the documents directly referred in other documents.
|
|
||||||
stack := make(CrawlSeed, 0)
|
|
||||||
|
|
||||||
// ch is the channel to which all the crawlers send the crawled documents.
|
// ch is the channel to which all the crawlers send the crawled documents.
|
||||||
ch := make(chan CrawledDocument, 1<<10)
|
ch := make(chan CrawledDocument, 1<<10)
|
||||||
@@ -324,7 +337,20 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
|||||||
"%v could not match any crawler", cdoc))
|
"%v could not match any crawler", cdoc))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// stack tracks the documents directly referred in the document.
|
||||||
|
stack := make(CrawlSeed, 0)
|
||||||
|
|
||||||
addBranches(cdoc, match, indx, seen, &stack)
|
addBranches(cdoc, match, indx, seen, &stack)
|
||||||
|
|
||||||
|
if len(stack) > 0 {
|
||||||
|
// here the documents referred in a kustomization file are crawled separately,
|
||||||
|
// to avoid accumulating all the referred documents into a single gigantic
|
||||||
|
// memory-intensive stack.
|
||||||
|
logger.Printf("crawling the %d new documents referred in doc %d",
|
||||||
|
len(stack), docCount)
|
||||||
|
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack, false, true)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@@ -336,9 +362,4 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
|||||||
}
|
}
|
||||||
close(ch)
|
close(ch)
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
// Handle deps of newly discovered documents.
|
|
||||||
logger.Printf("crawling the %d new documents referred by other documents",
|
|
||||||
len(stack))
|
|
||||||
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack, false, true)
|
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user