mirror of
https://github.com/kubernetes-sigs/kustomize.git
synced 2026-06-11 17:12:51 +00:00
Separate the two types of crawling
1) crawling the documents already in the index in order to update them; 2) crawling the whole of GitHub.
This commit is contained in:
@@ -67,19 +67,18 @@ func main() {
|
|||||||
github.Filename("kustomization.yml")),
|
github.Filename("kustomization.yml")),
|
||||||
)
|
)
|
||||||
|
|
||||||
crawler.CrawlFromSeed(ctx, docs, []crawler.Crawler{ghCrawler},
|
// docConverter takes in a plain document and processes it for the index.
|
||||||
// Converter takes in a plain document and processes it for the
|
docConverter := func(d *doc.Document) (crawler.CrawledDocument, error) {
|
||||||
// index.
|
|
||||||
func(d *doc.Document) (crawler.CrawledDocument, error) {
|
|
||||||
kdoc := doc.KustomizationDocument{
|
kdoc := doc.KustomizationDocument{
|
||||||
Document: *d,
|
Document: *d,
|
||||||
}
|
}
|
||||||
|
|
||||||
err := kdoc.ParseYAML()
|
err := kdoc.ParseYAML()
|
||||||
return &kdoc, err
|
return &kdoc, err
|
||||||
},
|
}
|
||||||
// IndexFunc updates the value in the index.
|
|
||||||
func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
|
// Index updates the value in the index.
|
||||||
|
index := func(cdoc crawler.CrawledDocument, crwlr crawler.Crawler) error {
|
||||||
switch d := cdoc.(type) {
|
switch d := cdoc.(type) {
|
||||||
case *doc.KustomizationDocument:
|
case *doc.KustomizationDocument:
|
||||||
fmt.Println("Inserting: ", d.ID(), d)
|
fmt.Println("Inserting: ", d.ID(), d)
|
||||||
@@ -88,6 +87,15 @@ func main() {
|
|||||||
default:
|
default:
|
||||||
return fmt.Errorf("type %T not supported", d)
|
return fmt.Errorf("type %T not supported", d)
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
)
|
|
||||||
|
// seen tracks the IDs of all the documents in the index.
|
||||||
|
// This helps avoid indexing a given document multiple times.
|
||||||
|
seen := make(map[string]struct{})
|
||||||
|
|
||||||
|
crawlers := []crawler.Crawler{ghCrawler}
|
||||||
|
|
||||||
|
crawler.CrawlFromSeed(ctx, docs, crawlers, docConverter, index, seen)
|
||||||
|
|
||||||
|
crawler.CrawlGithub(ctx, crawlers, docConverter, index, seen)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ type Crawler interface {
|
|||||||
type CrawledDocument interface {
|
type CrawledDocument interface {
|
||||||
ID() string
|
ID() string
|
||||||
GetDocument() *doc.Document
|
GetDocument() *doc.Document
|
||||||
|
// Get all the Documents directly referred in a Document.
|
||||||
GetResources() ([]*doc.Document, error)
|
GetResources() ([]*doc.Document, error)
|
||||||
WasCached() bool
|
WasCached() bool
|
||||||
}
|
}
|
||||||
@@ -49,33 +50,24 @@ type CrawlSeed []*doc.Document
|
|||||||
type IndexFunc func(CrawledDocument, Crawler) error
|
type IndexFunc func(CrawledDocument, Crawler) error
|
||||||
type Converter func(*doc.Document) (CrawledDocument, error)
|
type Converter func(*doc.Document) (CrawledDocument, error)
|
||||||
|
|
||||||
// Cleaner, more efficient, and more extensible crawler implementation.
|
func logIfErr(err error) {
|
||||||
// The seed must include the ids of each document in the index.
|
|
||||||
func CrawlFromSeed(ctx context.Context, seed CrawlSeed,
|
|
||||||
crawlers []Crawler, conv Converter, indx IndexFunc) {
|
|
||||||
|
|
||||||
seen := make(map[string]struct{})
|
|
||||||
|
|
||||||
logIfErr := func(err error) {
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
logger.Println("error: ", err)
|
logger.Println("error: ", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
stack := make(CrawlSeed, 0)
|
func findMatch(d *doc.Document, crawlers []Crawler) Crawler {
|
||||||
|
|
||||||
findMatch := func(d *doc.Document) Crawler {
|
|
||||||
for _, crawl := range crawlers {
|
for _, crawl := range crawlers {
|
||||||
if crawl.Match(d) {
|
if crawl.Match(d) {
|
||||||
return crawl
|
return crawl
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
addBranches := func(cdoc CrawledDocument, match Crawler) {
|
func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
|
||||||
|
seen map[string]struct{}, stack *CrawlSeed) {
|
||||||
if _, ok := seen[cdoc.ID()]; ok {
|
if _, ok := seen[cdoc.ID()]; ok {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -97,87 +89,74 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed,
|
|||||||
if _, ok := seen[dep.ID()]; ok {
|
if _, ok := seen[dep.ID()]; ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
stack = append(stack, dep)
|
*stack = append(*stack, dep)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
doCrawl := func(docsPtr *CrawlSeed) {
|
func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc,
|
||||||
n := len(*docsPtr)
|
seen map[string]struct{}, stack *CrawlSeed) {
|
||||||
for i := 0; i < n; i++ {
|
docCount := 0
|
||||||
next := (*docsPtr)[i]
|
// During the execution of the for loop, more Documents may be added into (*docsPtr).
|
||||||
match := findMatch(next)
|
for len(*docsPtr) > 0 {
|
||||||
|
docCount++
|
||||||
|
// Get the last Document in (*docsPtr), which will be crawled in this iteration.
|
||||||
|
tail := (*docsPtr)[len(*docsPtr)-1]
|
||||||
|
|
||||||
|
// Remove the last Document from (*docsPtr).
|
||||||
|
*docsPtr = (*docsPtr)[:(len(*docsPtr)-1)]
|
||||||
|
|
||||||
|
match := findMatch(tail, crawlers)
|
||||||
if match == nil {
|
if match == nil {
|
||||||
logIfErr(fmt.Errorf(
|
logIfErr(fmt.Errorf(
|
||||||
"%v could not match any crawler", next))
|
"%v could not match any crawler", tail))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.Println("Crawling ", next.RepositoryURL, next.FilePath)
|
logger.Println("Crawling ", tail.RepositoryURL, tail.FilePath)
|
||||||
err := match.FetchDocument(ctx, next)
|
err := match.FetchDocument(ctx, tail)
|
||||||
logIfErr(err)
|
logIfErr(err)
|
||||||
// If there was no change or there is an error, we don't have
|
// If there was no change or there is an error, we don't have
|
||||||
// to branch out, since the dependencies are already in the
|
// to branch out, since the dependencies are already in the
|
||||||
// index, or we cannot find the document.
|
// index, or we cannot find the document.
|
||||||
if err != nil || next.WasCached() {
|
if err != nil || tail.WasCached() {
|
||||||
if next.WasCached() {
|
if tail.WasCached() {
|
||||||
logger.Println(next.RepositoryURL, next.FilePath, "is cached already")
|
logger.Println(tail.RepositoryURL, tail.FilePath, "is cached already")
|
||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
logIfErr(match.SetCreated(ctx, next))
|
logIfErr(match.SetCreated(ctx, tail))
|
||||||
|
|
||||||
cdoc, err := conv(next)
|
cdoc, err := conv(tail)
|
||||||
logIfErr(err)
|
logIfErr(err)
|
||||||
|
|
||||||
addBranches(cdoc, match)
|
addBranches(cdoc, match, indx, seen, stack)
|
||||||
}
|
}
|
||||||
|
logger.Printf("%d documents were crawled by doCrawl\n", docCount)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CrawlFromSeed updates all the documents in seed, and crawls all the new
|
||||||
|
// documents referred to in the seed.
|
||||||
|
func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler,
|
||||||
|
conv Converter, indx IndexFunc, seen map[string]struct{}) {
|
||||||
|
|
||||||
|
// stack tracks the documents directly referred in other documents.
|
||||||
|
stack := make(CrawlSeed, 0)
|
||||||
|
|
||||||
// Exploit seed to update bulk of corpus.
|
// Exploit seed to update bulk of corpus.
|
||||||
logger.Printf("updating %d documents from seed\n", len(seed))
|
logger.Printf("updating %d documents from seed\n", len(seed))
|
||||||
doCrawl(&seed)
|
// each unique document in seed will be crawled once.
|
||||||
// Traverse any new links added while updating corpus.
|
doCrawl(ctx, &seed, crawlers, conv, indx, seen, &stack)
|
||||||
|
|
||||||
|
// Traverse any new documents added while updating corpus.
|
||||||
logger.Printf("crawling %d new documents found in the seed\n", len(stack))
|
logger.Printf("crawling %d new documents found in the seed\n", len(stack))
|
||||||
doCrawl(&stack)
|
// While crawling each document in stack, the documents directly referred in the document
|
||||||
|
// will be added into stack.
|
||||||
ch := make(chan CrawledDocument, 1<<10)
|
// After this statement is done, stack will become empty.
|
||||||
wg := sync.WaitGroup{}
|
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack)
|
||||||
|
|
||||||
wg.Add(1)
|
|
||||||
go func() {
|
|
||||||
defer wg.Done()
|
|
||||||
for cdoc := range ch {
|
|
||||||
if _, ok := seen[cdoc.ID()]; ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
match := findMatch(cdoc.GetDocument())
|
|
||||||
if match == nil {
|
|
||||||
logIfErr(fmt.Errorf(
|
|
||||||
"%v could not match any crawler", cdoc))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
addBranches(cdoc, match)
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
// Exploration through APIs.
|
|
||||||
errs := CRunner(ctx, ch, crawlers)
|
|
||||||
if errs != nil {
|
|
||||||
for _, err := range errs {
|
|
||||||
logIfErr(err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
close(ch)
|
|
||||||
logger.Println("Processing the new documents from the crawlers' exploration.")
|
|
||||||
wg.Wait()
|
|
||||||
// Handle deps of newly discovered documents.
|
|
||||||
logger.Printf("crawling the %d new documents from the crawlers' exploration.",
|
|
||||||
len(stack))
|
|
||||||
doCrawl(&stack)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// CRunner is a blocking function and only returns once all of the
|
// CrawlGithubRunner is a blocking function and only returns once all of the
|
||||||
// crawlers are finished with execution.
|
// crawlers are finished with execution.
|
||||||
//
|
//
|
||||||
// This function uses the output channel to forward kustomization documents
|
// This function uses the output channel to forward kustomization documents
|
||||||
@@ -188,14 +167,14 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed,
|
|||||||
// index of the crawler that emitted the error. Although the errors themselves
|
// index of the crawler that emitted the error. Although the errors themselves
|
||||||
// can be nil, the array will always be exactly the size of the crawlers array.
|
// can be nil, the array will always be exactly the size of the crawlers array.
|
||||||
//
|
//
|
||||||
// CRunner takes in a seed, which represents the documents stored in an
|
// CrawlGithubRunner takes in a seed, which represents the documents stored in an
|
||||||
// index somewhere. The document data is not required to be populated. If there
|
// index somewhere. The document data is not required to be populated. If there
|
||||||
// are many documents, this is preferable. The order of iteration over the seed
|
// are many documents, this is preferable. The order of iteration over the seed
|
||||||
// is not guaranteed, but the CRunner does guarantee that every element
|
// is not guaranteed, but the CrawlGithubRunner does guarantee that every element
|
||||||
// from the seed will be processed before any other documents from the
|
// from the seed will be processed before any other documents from the
|
||||||
// crawlers.
|
// crawlers.
|
||||||
func CRunner(ctx context.Context,
|
func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument,
|
||||||
output chan<- CrawledDocument, crawlers []Crawler) []error {
|
crawlers []Crawler) []error {
|
||||||
|
|
||||||
errs := make([]error, len(crawlers))
|
errs := make([]error, len(crawlers))
|
||||||
wg := sync.WaitGroup{}
|
wg := sync.WaitGroup{}
|
||||||
@@ -236,3 +215,45 @@ func CRunner(ctx context.Context,
|
|||||||
wg.Wait()
|
wg.Wait()
|
||||||
return errs
|
return errs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CrawlGithub crawls all the kustomization files on Github.
|
||||||
|
func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
|
||||||
|
indx IndexFunc, seen map[string]struct{}) {
|
||||||
|
// stack tracks the documents directly referred in other documents.
|
||||||
|
stack := make(CrawlSeed, 0)
|
||||||
|
|
||||||
|
// ch is the channel to which all the crawlers send the crawled documents.
|
||||||
|
ch := make(chan CrawledDocument, 1<<10)
|
||||||
|
|
||||||
|
wg := sync.WaitGroup{}
|
||||||
|
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
for cdoc := range ch {
|
||||||
|
if _, ok := seen[cdoc.ID()]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
match := findMatch(cdoc.GetDocument(), crawlers)
|
||||||
|
if match == nil {
|
||||||
|
logIfErr(fmt.Errorf(
|
||||||
|
"%v could not match any crawler", cdoc))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
addBranches(cdoc, match, indx, seen, &stack)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if errs := CrawlGithubRunner(ctx, ch, crawlers); errs != nil {
|
||||||
|
for _, err := range errs {
|
||||||
|
logIfErr(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close(ch)
|
||||||
|
logger.Println("Processing the documents found from crawling github")
|
||||||
|
wg.Wait()
|
||||||
|
// Handle deps of newly discovered documents.
|
||||||
|
logger.Printf("crawling the %d new documents referred by other documents",
|
||||||
|
len(stack))
|
||||||
|
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack)
|
||||||
|
}
|
||||||
|
|||||||
@@ -38,12 +38,13 @@ func (c testCrawler) FetchDocument(_ context.Context, d *doc.Document) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
for _, suffix := range konfig.RecognizedKustomizationFileNames() {
|
for _, suffix := range konfig.RecognizedKustomizationFileNames() {
|
||||||
fmt.Println(d.ID(), "/", suffix)
|
savedFilePath := d.FilePath
|
||||||
i, ok := c.lukp[d.ID()+"/"+suffix]
|
d.FilePath += "/" + suffix
|
||||||
|
i, ok := c.lukp[d.ID()]
|
||||||
if !ok {
|
if !ok {
|
||||||
|
d.FilePath = savedFilePath
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
d.FilePath += "/" + suffix
|
|
||||||
d.DocumentData = c.docs[i].DocumentData
|
d.DocumentData = c.docs[i].DocumentData
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -106,8 +107,8 @@ func (s sortableDocs) Len() int {
|
|||||||
return len(s)
|
return len(s)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCrawlerRunner(t *testing.T) {
|
func TestCrawlGithubRunner(t *testing.T) {
|
||||||
fmt.Println("testing CRunner")
|
fmt.Println("testing CrawlGithubRunner")
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
tc []Crawler
|
tc []Crawler
|
||||||
errs []error
|
errs []error
|
||||||
@@ -178,7 +179,7 @@ func TestCrawlerRunner(t *testing.T) {
|
|||||||
defer close(output)
|
defer close(output)
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
|
|
||||||
errs := CRunner(context.Background(),
|
errs := CrawlGithubRunner(context.Background(),
|
||||||
output, test.tc)
|
output, test.tc)
|
||||||
|
|
||||||
// Check that errors are returned as they should be.
|
// Check that errors are returned as they should be.
|
||||||
@@ -302,29 +303,6 @@ resources:
|
|||||||
RepositoryURL: kustomizeRepo,
|
RepositoryURL: kustomizeRepo,
|
||||||
FilePath: "examples/seedcrawl2/job.yaml",
|
FilePath: "examples/seedcrawl2/job.yaml",
|
||||||
}},
|
}},
|
||||||
// Visited from the crawler runner.
|
|
||||||
{Document: doc.Document{
|
|
||||||
RepositoryURL: kustomizeRepo,
|
|
||||||
FilePath: "examples/other/base/kustomization.yaml",
|
|
||||||
DocumentData: `
|
|
||||||
resources:
|
|
||||||
- ../app
|
|
||||||
`,
|
|
||||||
}},
|
|
||||||
// Visited from the crawler runner.
|
|
||||||
{Document: doc.Document{
|
|
||||||
RepositoryURL: kustomizeRepo,
|
|
||||||
FilePath: "examples/other/app/kustomization.yaml",
|
|
||||||
DocumentData: `
|
|
||||||
resources:
|
|
||||||
- resource.yaml
|
|
||||||
`,
|
|
||||||
}},
|
|
||||||
// Visited from crawling runner imported as resource.
|
|
||||||
{Document: doc.Document{
|
|
||||||
RepositoryURL: kustomizeRepo,
|
|
||||||
FilePath: "examples/other/app/resource.yaml",
|
|
||||||
}},
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@@ -342,6 +320,7 @@ resources:
|
|||||||
visited[d.ID()]++
|
visited[d.ID()]++
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
|
make(map[string]struct{}),
|
||||||
)
|
)
|
||||||
if lv, lc := len(visited), len(tc.corpus); lv != lc {
|
if lv, lc := len(visited), len(tc.corpus); lv != lc {
|
||||||
t.Errorf("error: %d of %d documents visited.", lv, lc)
|
t.Errorf("error: %d of %d documents visited.", lv, lc)
|
||||||
|
|||||||
Reference in New Issue
Block a user