Add a fileType field into the index

This commit is contained in:
Haiyan Meng
2020-01-16 11:28:41 -08:00
parent 5dde9485a2
commit f4636f8555
12 changed files with 224 additions and 52 deletions

View File

@@ -126,7 +126,7 @@ func main() {
} }
} }
// seen tracks the IDs of all the documents in the index. // seen tracks the IDs of all the documents in the index and their corresponding file types.
// This helps avoid indexing a given document multiple times. // This helps avoid indexing a given document multiple times.
seen := utils.NewSeenMap() seen := utils.NewSeenMap()

View File

@@ -38,6 +38,8 @@ type Crawler interface {
// Write to the document what the created time is. // Write to the document what the created time is.
SetCreated(context.Context, *doc.Document) error SetCreated(context.Context, *doc.Document) error
SetDefaultBranch(*doc.Document)
Match(*doc.Document) bool Match(*doc.Document) bool
} }
@@ -78,7 +80,7 @@ func findMatch(d *doc.Document, crawlers []Crawler) Crawler {
func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
seen utils.SeenMap, stack *CrawlSeed) { seen utils.SeenMap, stack *CrawlSeed) {
seen.Add(cdoc.ID()) seen.Set(cdoc.ID(), cdoc.GetDocument().FileType)
// Insert into index // Insert into index
if err := indx(cdoc, index.InsertOrUpdate); err != nil { if err := indx(cdoc, index.InsertOrUpdate); err != nil {
@@ -87,14 +89,14 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
return return
} }
deps, err := cdoc.GetResources(true, false, false) deps, err := cdoc.GetResources(true, true, true)
if err != nil { if err != nil {
logger.Println(err) logger.Println(err)
return return
} }
for _, dep := range deps { for _, dep := range deps {
if seen.Seen(dep.ID()) { if seen.Seen(dep.ID()) && seen.Value(dep.ID()) == dep.FileType {
continue continue
} }
*stack = append(*stack, dep) *stack = append(*stack, dep)
@@ -102,7 +104,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
} }
func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc, func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc,
seen utils.SeenMap, stack *CrawlSeed) { seen utils.SeenMap, stack *CrawlSeed, refreshDoc bool, updateFileType bool) {
UpdatedDocCount := 0 UpdatedDocCount := 0
seenDocCount := 0 seenDocCount := 0
@@ -126,9 +128,11 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
logger.Printf("Crawling doc %d: %s", crawledDocCount, tail.Path()) logger.Printf("Crawling doc %d: %s", crawledDocCount, tail.Path())
if seen.Seen(tail.ID()) { if seen.Seen(tail.ID()) {
logger.Printf("this doc has been seen before") if !updateFileType || seen.Value(tail.ID()) == tail.FileType {
seenDocCount++ logger.Printf("this doc has been seen before")
continue seenDocCount++
continue
}
} }
if tail.WasCached() { if tail.WasCached() {
@@ -151,26 +155,34 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
// calling FetchDocument. Otherwise, the binary may enter into an infinite loop // calling FetchDocument. Otherwise, the binary may enter into an infinite loop
// if a kustomization file points to its kustmozation root in its `resources` or // if a kustomization file points to its kustmozation root in its `resources` or
// `bases` field. // `bases` field.
seen.Add(tail.ID()) seen.Set(tail.ID(), tail.FileType)
if err := match.FetchDocument(ctx, tail); err != nil { if refreshDoc || tail.DefaultBranch == "" {
logger.Printf("FetchDocument failed on doc(%s): %v", tail.Path(), err) match.SetDefaultBranch(tail)
FetchDocumentErrCount++
// delete the document from the index
cdoc := &doc.KustomizationDocument{
Document: *tail,
}
seen.Add(cdoc.ID())
if err := indx(cdoc, index.Delete); err != nil {
logger.Printf("Failed to delete doc(%s): %v", cdoc.Path(), err)
}
deleteDocCount++
continue
} }
if err := match.SetCreated(ctx, tail); err != nil { if refreshDoc || tail.DocumentData == "" {
logger.Printf("SetCreated failed on doc(%s): %v", tail.Path(), err) if err := match.FetchDocument(ctx, tail); err != nil {
SetCreatedErrCount++ logger.Printf("FetchDocument failed on doc(%s): %v", tail.Path(), err)
FetchDocumentErrCount++
// delete the document from the index
cdoc := &doc.KustomizationDocument{
Document: *tail,
}
seen.Set(cdoc.ID(), tail.FileType)
if err := indx(cdoc, index.Delete); err != nil {
logger.Printf("Failed to delete doc(%s): %v", cdoc.Path(), err)
}
deleteDocCount++
continue
}
}
if refreshDoc || tail.CreationTime == nil {
if err := match.SetCreated(ctx, tail); err != nil {
logger.Printf("SetCreated failed on doc(%s): %v", tail.Path(), err)
SetCreatedErrCount++
}
} }
cdoc, err := conv(tail) cdoc, err := conv(tail)
@@ -206,14 +218,14 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler,
// Exploit seed to update bulk of corpus. // Exploit seed to update bulk of corpus.
logger.Printf("updating %d documents from seed\n", len(seed)) logger.Printf("updating %d documents from seed\n", len(seed))
// each unique document in seed will be crawled once. // each unique document in seed will be crawled once.
doCrawl(ctx, &seed, crawlers, conv, indx, seen, &stack) doCrawl(ctx, &seed, crawlers, conv, indx, seen, &stack, true, false)
// Traverse any new documents added while updating corpus. // Traverse any new documents added while updating corpus.
logger.Printf("crawling %d new documents found in the seed\n", len(stack)) logger.Printf("crawling %d new documents found in the seed\n", len(stack))
// While crawling each document in stack, the documents directly referred in the document // While crawling each document in stack, the documents directly referred in the document
// will be added into stack. // will be added into stack.
// After this statement is done, stack will become empty. // After this statement is done, stack will become empty.
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack) doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack, false, true)
} }
// CrawlGithubRunner is a blocking function and only returns once all of the // CrawlGithubRunner is a blocking function and only returns once all of the
@@ -294,6 +306,8 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
for cdoc := range ch { for cdoc := range ch {
docCount++ docCount++
logger.Printf("Processing doc %d found on Github", docCount) logger.Printf("Processing doc %d found on Github", docCount)
// all the docs here are kustomization files found by querying Github, and
// their `FileType` fields all should be empty.
if seen.Seen(cdoc.ID()) { if seen.Seen(cdoc.ID()) {
logger.Printf("the doc has been seen before") logger.Printf("the doc has been seen before")
continue continue
@@ -320,5 +334,5 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
// Handle deps of newly discovered documents. // Handle deps of newly discovered documents.
logger.Printf("crawling the %d new documents referred by other documents", logger.Printf("crawling the %d new documents referred by other documents",
len(stack)) len(stack))
doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack) doCrawl(ctx, &stack, crawlers, conv, indx, seen, &stack, false, true)
} }

View File

@@ -37,6 +37,8 @@ func (c testCrawler) Match(d *doc.Document) bool {
return d != nil return d != nil
} }
func (c testCrawler) SetDefaultBranch(d *doc.Document) {}
func (c testCrawler) FetchDocument(_ context.Context, d *doc.Document) error { func (c testCrawler) FetchDocument(_ context.Context, d *doc.Document) error {
if i, ok := c.lukp[d.ID()]; ok { if i, ok := c.lukp[d.ID()]; ok {
d.DocumentData = c.docs[i].DocumentData d.DocumentData = c.docs[i].DocumentData

View File

@@ -60,8 +60,16 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
} }
} }
func (gc githubCrawler) SetDefaultBranch(repo, branch string) { func (gc githubCrawler) SetDefaultBranch(d *doc.Document) {
gc.branchMap[repo] = branch url := gc.client.ReposRequest(d.RepositoryFullName())
defaultBranch, err := gc.client.GetDefaultBranch(url, d.RepositoryURL, gc.branchMap)
if err != nil {
logger.Printf(
"(error: %v) setting default_branch to master\n", err)
defaultBranch = "master"
}
d.DefaultBranch = defaultBranch
gc.branchMap[d.RepositoryURL] = d.DefaultBranch
} }
func (gc githubCrawler) DefaultBranch(repo string) string { func (gc githubCrawler) DefaultBranch(repo string) string {
@@ -114,19 +122,6 @@ func (gc githubCrawler) Crawl(ctx context.Context,
// it will try to add each string in konfig.RecognizedKustomizationFileNames() to // it will try to add each string in konfig.RecognizedKustomizationFileNames() to
// d.FilePath, and try to fetch the document again. // d.FilePath, and try to fetch the document again.
func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error { func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error {
// set the default branch if it is empty
if d.DefaultBranch == "" {
url := gc.client.ReposRequest(d.RepositoryFullName())
defaultBranch, err := gc.client.GetDefaultBranch(url, d.RepositoryURL, gc.branchMap)
if err != nil {
logger.Printf(
"(error: %v) setting default_branch to master\n", err)
defaultBranch = "master"
}
d.DefaultBranch = defaultBranch
}
gc.SetDefaultBranch(d.RepositoryURL, d.DefaultBranch)
repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch
repoSpec, err := git.NewRepoSpecFromUrl(repoURL) repoSpec, err := git.NewRepoSpecFromUrl(repoURL)
if err != nil { if err != nil {
@@ -283,6 +278,8 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap,
defaultBranch = "master" defaultBranch = "master"
} }
// document here is a kustomization file found by querying Github, whose
// `FileType` field should be empty.
document := doc.Document{ document := doc.Document{
FilePath: k.Path, FilePath: k.Path,
DefaultBranch: defaultBranch, DefaultBranch: defaultBranch,

View File

@@ -87,17 +87,17 @@ func (doc *KustomizationDocument) GetResources(
res := make([]*Document, 0) res := make([]*Document, 0)
if includeResources { if includeResources {
resourceDocs := doc.CollectDocuments(k.Resources) resourceDocs := doc.CollectDocuments(k.Resources, "resource")
res = append(res, resourceDocs...) res = append(res, resourceDocs...)
} }
if includeGenerators { if includeGenerators {
generatorDocs := doc.CollectDocuments(k.Generators) generatorDocs := doc.CollectDocuments(k.Generators, "generator")
res = append(res, generatorDocs...) res = append(res, generatorDocs...)
} }
if includeTransformers { if includeTransformers {
transformerDocs := doc.CollectDocuments(k.Transformers) transformerDocs := doc.CollectDocuments(k.Transformers, "transformer")
res = append(res, transformerDocs...) res = append(res, transformerDocs...)
} }
@@ -106,7 +106,8 @@ func (doc *KustomizationDocument) GetResources(
// CollectDocuments construct a Document for each path in paths, and return // CollectDocuments construct a Document for each path in paths, and return
// a slice of Document pointers. // a slice of Document pointers.
func (doc *KustomizationDocument) CollectDocuments(paths []string) []*Document { func (doc *KustomizationDocument) CollectDocuments(
paths []string, fileType string) []*Document {
docs := make([]*Document, 0, len(paths)) docs := make([]*Document, 0, len(paths))
for _, r := range paths { for _, r := range paths {
if strings.TrimSpace(r) == "" { if strings.TrimSpace(r) == "" {
@@ -117,6 +118,7 @@ func (doc *KustomizationDocument) CollectDocuments(paths []string) []*Document {
log.Printf("CollectDocuments error: %v\n", err) log.Printf("CollectDocuments error: %v\n", err)
continue continue
} }
next.FileType = fileType
docs = append(docs, &next) docs = append(docs, &next)
} }
return docs return docs

View File

@@ -215,19 +215,23 @@ resources:
{ {
RepositoryURL: "sigs.k8s.io/kustomize", RepositoryURL: "sigs.k8s.io/kustomize",
FilePath: "some/path/to/base", FilePath: "some/path/to/base",
FileType: "resource",
}, },
{ {
RepositoryURL: "sigs.k8s.io/kustomize", RepositoryURL: "sigs.k8s.io/kustomize",
FilePath: "some/path/to/otherbase", FilePath: "some/path/to/otherbase",
FileType: "resource",
}, },
{ {
RepositoryURL: "sigs.k8s.io/kustomize", RepositoryURL: "sigs.k8s.io/kustomize",
FilePath: "some/path/to/kdir/file.yaml", FilePath: "some/path/to/kdir/file.yaml",
FileType: "resource",
}, },
{ {
RepositoryURL: "https://github.com/kubernetes-sigs/kustomize", RepositoryURL: "https://github.com/kubernetes-sigs/kustomize",
FilePath: "examples/helloWorld", FilePath: "examples/helloWorld",
DefaultBranch: "v3.1.0", DefaultBranch: "v3.1.0",
FileType: "resource",
}, },
}, },
}, },
@@ -312,10 +316,12 @@ transformers:
{ {
RepositoryURL: "sigs.k8s.io/kustomize", RepositoryURL: "sigs.k8s.io/kustomize",
FilePath: "some/path/to/kdir/gen.yaml", FilePath: "some/path/to/kdir/gen.yaml",
FileType: "generator",
}, },
{ {
RepositoryURL: "sigs.k8s.io/kustomize", RepositoryURL: "sigs.k8s.io/kustomize",
FilePath: "some/path/to/kdir/file.yaml", FilePath: "some/path/to/kdir/file.yaml",
FileType: "resource",
}, },
}, },
}, },
@@ -345,14 +351,17 @@ transformers:
{ {
RepositoryURL: "sigs.k8s.io/kustomize", RepositoryURL: "sigs.k8s.io/kustomize",
FilePath: "some/path/to/kdir/tr.yaml", FilePath: "some/path/to/kdir/tr.yaml",
FileType: "transformer",
}, },
{ {
RepositoryURL: "sigs.k8s.io/kustomize", RepositoryURL: "sigs.k8s.io/kustomize",
FilePath: "some/path/to/kdir/gen.yaml", FilePath: "some/path/to/kdir/gen.yaml",
FileType: "generator",
}, },
{ {
RepositoryURL: "sigs.k8s.io/kustomize", RepositoryURL: "sigs.k8s.io/kustomize",
FilePath: "some/path/to/kdir/file.yaml", FilePath: "some/path/to/kdir/file.yaml",
FileType: "resource",
}, },
}, },
}, },

View File

@@ -17,6 +17,9 @@ type Document struct {
DocumentData string `json:"document,omitempty"` DocumentData string `json:"document,omitempty"`
CreationTime *time.Time `json:"creationTime,omitempty"` CreationTime *time.Time `json:"creationTime,omitempty"`
IsSame bool `json:"-"` IsSame bool `json:"-"`
// FileType can be one of the following:
// "generator", "transformer", "resource", "".
FileType string `json:"fileType,omitempty"`
} }
// Implements the CrawlerDocument interface. // Implements the CrawlerDocument interface.
@@ -32,6 +35,7 @@ func (doc *Document) Copy() *Document {
DocumentData: doc.DocumentData, DocumentData: doc.DocumentData,
CreationTime: doc.CreationTime, CreationTime: doc.CreationTime,
IsSame: doc.IsSame, IsSame: doc.IsSame,
FileType: doc.FileType,
} }
} }

View File

@@ -22,7 +22,7 @@ func (uds *UniqueDocuments) Add(d *Document) {
return return
} }
uds.docs = append(uds.docs, d) uds.docs = append(uds.docs, d)
uds.docIDs.Add(d.ID()) uds.docIDs.Set(d.ID(), "")
} }
func (uds *UniqueDocuments) AddDocuments(docs []*Document) { func (uds *UniqueDocuments) AddDocuments(docs []*Document) {

View File

@@ -26,6 +26,9 @@ const IndexConfig = `
"defaultBranch": { "defaultBranch": {
"type": "keyword" "type": "keyword"
}, },
"fileType": {
"type": "keyword"
},
"document": { "document": {
"type": "text" "type": "text"
}, },

View File

@@ -0,0 +1,123 @@
Find all the documents having the `fileType` field set:
```
curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d'
{
"query": {
"exists": {
"field": "fileType"
}
}
}
'
```
Find all the documents whose `fileType` field is not set:
```
curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d'
{
"size": 10000,
"query": {
"bool": {
"must_not": {
"exists": {
"field": "fileType"
}
}
}
}
}
'
```
Search for all the documents whose `fileType` field is `resource`:
```
curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d'
{
"query": {
"bool": {
"filter": [
{ "regexp": { "fileType": "resource" }}
]
}
}
}
'
```
Count distinct values of the `fileType` field:
```
curl -X POST "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d'
{
"aggs" : {
"fileType_count" : {
"cardinality" : {
"field" : "fileType",
"precision_threshold": 40000
}
}
}
}
'
```
List all the values of the `fileType` field and the frequency of each value:
```
curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d'
{
"aggs" : {
"fileType" : {
"terms" : {
"field" : "fileType"
}
}
}
}
'
```
For all the kustomization files in the index, list all the values of the
`fileType` field and the frequency of each value:
```
curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d'
{
"query": {
"bool": {
"filter": [
{ "regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }}
]
}
},
"aggs" : {
"fileType" : {
"terms" : {
"field" : "fileType"
}
}
}
}
'
```
For all the non-kustomization files in the index, list all the values of the
`fileType` field and the frequency of each value:
```
curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?size=0&pretty" -H 'Content-Type: application/json' -d'
{
"query": {
"bool": {
"must_not": {
"regexp": { "filePath": ".*/kustomization((.yaml)?|(.yml)?)/*" }
}
}
},
"aggs" : {
"fileType" : {
"terms" : {
"field" : "fileType"
}
}
}
}
'
```

View File

@@ -17,3 +17,16 @@ Delete the kustomize index from the ElasticSearch cluster (**Use this command wi
``` ```
curl -X DELETE "${ElasticSearchURL}:9200/${INDEXNAME}?pretty" curl -X DELETE "${ElasticSearchURL}:9200/${INDEXNAME}?pretty"
``` ```
Add a new field into an existing index.
```
curl -X PUT "${ElasticSearchURL}:9200/${INDEXNAME}/_mapping/_doc?pretty" -H 'Content-Type: application/json' -d'
{
"properties": {
"fileType": {
"type": "keyword"
}
}
}
'
```

View File

@@ -1,16 +1,21 @@
package utils package utils
type SeenMap map[string]struct{} type SeenMap map[string]string
func (seen SeenMap) Seen(item string) bool { func (seen SeenMap) Seen(item string) bool {
_, ok := seen[item] _, ok := seen[item]
return ok return ok
} }
func (seen SeenMap) Add(item string) { func (seen SeenMap) Set(k, v string) {
seen[item] = struct{}{} seen[k] = v
}
// The caller should make sure that key is in the map.
func (seen SeenMap) Value(k string) string {
return seen[k]
} }
func NewSeenMap() SeenMap { func NewSeenMap() SeenMap {
return make(map[string]struct{}) return make(map[string]string)
} }