Move SeenMap to the utils dir

This commit is contained in:
Haiyan Meng
2020-01-15 12:04:22 -08:00
parent aaaba99389
commit cf8d53a195
10 changed files with 85 additions and 40 deletions

View File

@@ -9,6 +9,8 @@ import (
"os" "os"
"time" "time"
"sigs.k8s.io/kustomize/api/internal/crawl/utils"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler" "sigs.k8s.io/kustomize/api/internal/crawl/crawler"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler/github" "sigs.k8s.io/kustomize/api/internal/crawl/crawler/github"
"sigs.k8s.io/kustomize/api/internal/crawl/doc" "sigs.k8s.io/kustomize/api/internal/crawl/doc"
@@ -26,6 +28,7 @@ const (
) )
type CrawlMode int type CrawlMode int
const ( const (
CrawlUnknown CrawlMode = iota CrawlUnknown CrawlMode = iota
// Crawl all the kustomization files in all the repositories of a Github user // Crawl all the kustomization files in all the repositories of a Github user
@@ -125,13 +128,13 @@ func main() {
// seen tracks the IDs of all the documents in the index. // seen tracks the IDs of all the documents in the index.
// This helps avoid indexing a given document multiple times. // This helps avoid indexing a given document multiple times.
seen := crawler.NewSeenMap() seen := utils.NewSeenMap()
mode := NewCrawlMode(*modePtr) mode := NewCrawlMode(*modePtr)
ghCrawlerConstructor := func(user, repo string) crawler.Crawler { ghCrawlerConstructor := func(user, repo string) crawler.Crawler {
if user != "" { if user != "" {
return github.NewCrawler(githubToken, retryCount, clientCache, return github.NewCrawler(githubToken, retryCount, clientCache,
github.QueryWith( github.QueryWith(
github.Filename("kustomization.yaml"), github.Filename("kustomization.yaml"),
github.Filename("kustomization.yml"), github.Filename("kustomization.yml"),

View File

@@ -7,7 +7,6 @@ import (
"log" "log"
"net/http" "net/http"
"os" "os"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler"
"sort" "sort"
"time" "time"
@@ -51,10 +50,10 @@ func GeneratorAndTransformerStats(ctx context.Context,
generatorDocs []*doc.Document, transformerDocs []*doc.Document, generatorDocs []*doc.Document, transformerDocs []*doc.Document,
idx *index.KustomizeIndex) { idx *index.KustomizeIndex) {
// allGenerators includes all the documents referred in the generators field // allGenerators includes all the documents referred in the generators field
allGenerators := crawler.NewUniqueDocuments() allGenerators := doc.NewUniqueDocuments()
// allTransformers includes all the documents referred in the transformers field // allTransformers includes all the documents referred in the transformers field
allTransformers := crawler.NewUniqueDocuments() allTransformers := doc.NewUniqueDocuments()
// docUsingGeneratorCount counts the number of the kustomization files using generators // docUsingGeneratorCount counts the number of the kustomization files using generators
docUsingGeneratorCount := 0 docUsingGeneratorCount := 0

View File

@@ -10,6 +10,8 @@ import (
"os" "os"
"sync" "sync"
"sigs.k8s.io/kustomize/api/internal/crawl/utils"
"sigs.k8s.io/kustomize/api/internal/crawl/index" "sigs.k8s.io/kustomize/api/internal/crawl/index"
_ "github.com/gomodule/redigo/redis" _ "github.com/gomodule/redigo/redis"
@@ -29,7 +31,7 @@ type Crawler interface {
// Crawl returns when it is done processing. This method does not take // Crawl returns when it is done processing. This method does not take
// ownership of the channel. The channel is write only, and it // ownership of the channel. The channel is write only, and it
// designates where the crawler should forward the documents. // designates where the crawler should forward the documents.
Crawl(ctx context.Context, output chan<- CrawledDocument, seen SeenMap) error Crawl(ctx context.Context, output chan<- CrawledDocument, seen utils.SeenMap) error
// Get the document data given the FilePath, Repo, and Ref/Tag/Branch. // Get the document data given the FilePath, Repo, and Ref/Tag/Branch.
FetchDocument(context.Context, *doc.Document) error FetchDocument(context.Context, *doc.Document) error
@@ -52,21 +54,6 @@ type CrawledDocument interface {
WasCached() bool WasCached() bool
} }
type SeenMap map[string]struct{}
func (seen SeenMap) Seen(item string) bool {
_, ok := seen[item]
return ok
}
func (seen SeenMap) Add(item string) {
seen[item] = struct{}{}
}
func NewSeenMap() SeenMap {
return make(map[string]struct{})
}
type CrawlSeed []*doc.Document type CrawlSeed []*doc.Document
type IndexFunc func(CrawledDocument, index.Mode) error type IndexFunc func(CrawledDocument, index.Mode) error
@@ -89,7 +76,7 @@ func findMatch(d *doc.Document, crawlers []Crawler) Crawler {
} }
func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc, func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
seen SeenMap, stack *CrawlSeed) { seen utils.SeenMap, stack *CrawlSeed) {
seen.Add(cdoc.ID()) seen.Add(cdoc.ID())
@@ -115,7 +102,7 @@ func addBranches(cdoc CrawledDocument, match Crawler, indx IndexFunc,
} }
func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc, func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv Converter, indx IndexFunc,
seen SeenMap, stack *CrawlSeed) { seen utils.SeenMap, stack *CrawlSeed) {
UpdatedDocCount := 0 UpdatedDocCount := 0
seenDocCount := 0 seenDocCount := 0
@@ -166,7 +153,6 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
// `bases` field. // `bases` field.
seen.Add(tail.ID()) seen.Add(tail.ID())
if err := match.FetchDocument(ctx, tail); err != nil { if err := match.FetchDocument(ctx, tail); err != nil {
logger.Printf("FetchDocument failed on doc(%s): %v", tail.Path(), err) logger.Printf("FetchDocument failed on doc(%s): %v", tail.Path(), err)
FetchDocumentErrCount++ FetchDocumentErrCount++
@@ -212,7 +198,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
// CrawlFromSeed updates all the documents in seed, and crawls all the new // CrawlFromSeed updates all the documents in seed, and crawls all the new
// documents referred in the seed. // documents referred in the seed.
func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler, func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler,
conv Converter, indx IndexFunc, seen SeenMap) { conv Converter, indx IndexFunc, seen utils.SeenMap) {
// stack tracks the documents directly referred in other documents. // stack tracks the documents directly referred in other documents.
stack := make(CrawlSeed, 0) stack := make(CrawlSeed, 0)
@@ -248,7 +234,7 @@ func CrawlFromSeed(ctx context.Context, seed CrawlSeed, crawlers []Crawler,
// from the seed will be processed before any other documents from the // from the seed will be processed before any other documents from the
// crawlers. // crawlers.
func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument, func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument,
crawlers []Crawler, seen SeenMap) []error { crawlers []Crawler, seen utils.SeenMap) []error {
errs := make([]error, len(crawlers)) errs := make([]error, len(crawlers))
wg := sync.WaitGroup{} wg := sync.WaitGroup{}
@@ -292,7 +278,7 @@ func CrawlGithubRunner(ctx context.Context, output chan<- CrawledDocument,
// CrawlGithub crawls all the kustomization files on Github. // CrawlGithub crawls all the kustomization files on Github.
func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter, func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
indx IndexFunc, seen SeenMap) { indx IndexFunc, seen utils.SeenMap) {
// stack tracks the documents directly referred in other documents. // stack tracks the documents directly referred in other documents.
stack := make(CrawlSeed, 0) stack := make(CrawlSeed, 0)

View File

@@ -12,6 +12,8 @@ import (
"testing" "testing"
"time" "time"
"sigs.k8s.io/kustomize/api/internal/crawl/utils"
"sigs.k8s.io/kustomize/api/internal/crawl/index" "sigs.k8s.io/kustomize/api/internal/crawl/index"
"sigs.k8s.io/kustomize/api/internal/crawl/doc" "sigs.k8s.io/kustomize/api/internal/crawl/doc"
@@ -76,7 +78,7 @@ func newCrawler(matchPrefix string, err error,
// Crawl implements the Crawler interface for testing. // Crawl implements the Crawler interface for testing.
func (c testCrawler) Crawl(_ context.Context, func (c testCrawler) Crawl(_ context.Context,
output chan<- CrawledDocument, _ SeenMap) error { output chan<- CrawledDocument, _ utils.SeenMap) error {
for i, d := range c.docs { for i, d := range c.docs {
isResource := true isResource := true
@@ -182,7 +184,7 @@ func TestCrawlGithubRunner(t *testing.T) {
defer close(output) defer close(output)
defer wg.Done() defer wg.Done()
seen := NewSeenMap() seen := utils.NewSeenMap()
errs := CrawlGithubRunner(context.Background(), errs := CrawlGithubRunner(context.Background(),
output, test.tc, seen) output, test.tc, seen)
@@ -324,7 +326,7 @@ resources:
visited[d.ID()]++ visited[d.ID()]++
return nil return nil
}, },
NewSeenMap(), utils.NewSeenMap(),
) )
if lv, lc := len(visited), len(tc.corpus); lv != lc { if lv, lc := len(visited), len(tc.corpus); lv != lc {
t.Errorf("error: %d of %d documents visited.", lv, lc) t.Errorf("error: %d of %d documents visited.", lv, lc)

View File

@@ -16,6 +16,8 @@ import (
"strings" "strings"
"time" "time"
"sigs.k8s.io/kustomize/api/internal/crawl/utils"
"sigs.k8s.io/kustomize/api/internal/crawl/crawler" "sigs.k8s.io/kustomize/api/internal/crawl/crawler"
"sigs.k8s.io/kustomize/api/internal/crawl/doc" "sigs.k8s.io/kustomize/api/internal/crawl/doc"
"sigs.k8s.io/kustomize/api/internal/crawl/httpclient" "sigs.k8s.io/kustomize/api/internal/crawl/httpclient"
@@ -68,7 +70,7 @@ func (gc githubCrawler) DefaultBranch(repo string) string {
// Implements crawler.Crawler. // Implements crawler.Crawler.
func (gc githubCrawler) Crawl(ctx context.Context, func (gc githubCrawler) Crawl(ctx context.Context,
output chan<- crawler.CrawledDocument, seen crawler.SeenMap) error { output chan<- crawler.CrawledDocument, seen utils.SeenMap) error {
noETagClient := GhClient{ noETagClient := GhClient{
RequestConfig: gc.client.RequestConfig, RequestConfig: gc.client.RequestConfig,
@@ -195,9 +197,9 @@ func (gc githubCrawler) Match(d *doc.Document) bool {
type RangeQueryResult struct { type RangeQueryResult struct {
totalDocCnt uint64 totalDocCnt uint64
seenDocCnt uint64 seenDocCnt uint64
newDocCnt uint64 newDocCnt uint64
errorCnt uint64 errorCnt uint64
} }
func (r *RangeQueryResult) Add(other RangeQueryResult) { func (r *RangeQueryResult) Add(other RangeQueryResult) {
@@ -209,7 +211,7 @@ func (r *RangeQueryResult) Add(other RangeQueryResult) {
func (r *RangeQueryResult) String() string { func (r *RangeQueryResult) String() string {
return fmt.Sprintf("got %d files from API. "+ return fmt.Sprintf("got %d files from API. "+
"%d have been seen before. %d are new and sent to the output channel." + "%d have been seen before. %d are new and sent to the output channel."+
" %d have kustomizationResultAdapter errors.", " %d have kustomizationResultAdapter errors.",
r.totalDocCnt, r.seenDocCnt, r.newDocCnt, r.errorCnt) r.totalDocCnt, r.seenDocCnt, r.newDocCnt, r.errorCnt)
} }
@@ -217,7 +219,7 @@ func (r *RangeQueryResult) String() string {
// processQuery follows all of the pages in a query, and updates/adds the // processQuery follows all of the pages in a query, and updates/adds the
// documents from the crawl to the datastore/index. // documents from the crawl to the datastore/index.
func processQuery(ctx context.Context, gcl GhClient, query string, func processQuery(ctx context.Context, gcl GhClient, query string,
output chan<- crawler.CrawledDocument, seen crawler.SeenMap, output chan<- crawler.CrawledDocument, seen utils.SeenMap,
branchMap map[string]string) (RangeQueryResult, error) { branchMap map[string]string) (RangeQueryResult, error) {
queryPages := make(chan GhResponseInfo) queryPages := make(chan GhResponseInfo)
@@ -271,7 +273,7 @@ func processQuery(ctx context.Context, gcl GhClient, query string,
return result, errs return result, errs
} }
func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen crawler.SeenMap, func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap,
branchMap map[string]string) (crawler.CrawledDocument, error) { branchMap map[string]string) (crawler.CrawledDocument, error) {
url := gcl.ReposRequest(k.Repository.FullName) url := gcl.ReposRequest(k.Repository.FullName)
defaultBranch, err := gcl.GetDefaultBranch(url, k.Repository.URL, branchMap) defaultBranch, err := gcl.GetDefaultBranch(url, k.Repository.URL, branchMap)

View File

@@ -117,7 +117,7 @@ type RequestConfig struct {
// understand why the request object is useful. // understand why the request object is useful.
func (rc RequestConfig) CodeSearchRequestWith(query Query) request { func (rc RequestConfig) CodeSearchRequestWith(query Query) request {
vals := url.Values{ vals := url.Values{
"sort": []string{"indexed"}, "sort": []string{"indexed"},
"order": []string{"desc"}, "order": []string{"desc"},
} }
req := rc.makeRequest("search/code", query, vals) req := rc.makeRequest("search/code", query, vals)

View File

@@ -65,7 +65,7 @@ func TestFromRelativePath(t *testing.T) {
func TestDocument_RepositoryFullName(t *testing.T) { func TestDocument_RepositoryFullName(t *testing.T) {
testCases := []struct { testCases := []struct {
doc Document doc Document
expectedRepositoryFullName string expectedRepositoryFullName string
}{ }{
{ {
@@ -108,4 +108,4 @@ func TestDocument_RepositoryFullName(t *testing.T) {
returnedRepositoryFullName) returnedRepositoryFullName)
} }
} }
} }

View File

@@ -0,0 +1,36 @@
package doc
import (
"sigs.k8s.io/kustomize/api/internal/crawl/utils"
)
// UniqueDocuments collects Documents while ensuring that a Document with a
// given ID is stored only once. Documents are kept in the order in which they
// were first added.
type UniqueDocuments struct {
	// docs holds the accepted documents in first-insertion order.
	docs []*Document
	// docIDs records the ID of every document already held in docs.
	docIDs utils.SeenMap
}

// NewUniqueDocuments returns an empty UniqueDocuments whose document slice
// and ID set are both initialized.
func NewUniqueDocuments() UniqueDocuments {
	var unique UniqueDocuments
	unique.docs = []*Document{}
	unique.docIDs = utils.NewSeenMap()
	return unique
}

// Add stores d unless a document with the same ID has been stored before, in
// which case the call is a no-op.
func (uds *UniqueDocuments) Add(d *Document) {
	if !uds.docIDs.Seen(d.ID()) {
		// Record the ID and keep the document; the two updates are
		// independent of each other.
		uds.docIDs.Add(d.ID())
		uds.docs = append(uds.docs, d)
	}
}

// AddDocuments calls Add for every element of docs, in slice order, so
// duplicates within docs or against earlier additions are dropped.
func (uds *UniqueDocuments) AddDocuments(docs []*Document) {
	for i := range docs {
		uds.Add(docs[i])
	}
}

// Documents returns the stored documents. The returned slice is the internal
// one, not a copy.
func (uds *UniqueDocuments) Documents() []*Document {
	stored := uds.docs
	return stored
}

View File

@@ -18,6 +18,7 @@ const (
) )
type Mode int type Mode int
const ( const (
InsertOrUpdate = iota InsertOrUpdate = iota
Delete Delete

View File

@@ -0,0 +1,16 @@
package utils
// SeenMap is a set of strings, implemented as a map with empty-struct values.
type SeenMap map[string]struct{}

// Seen reports whether item is present in the set. It is safe to call on a
// nil SeenMap, which contains nothing.
func (sm SeenMap) Seen(item string) bool {
	if _, present := sm[item]; present {
		return true
	}
	return false
}

// Add inserts item into the set. Adding an item that is already present has
// no further effect. The map must be non-nil (see NewSeenMap).
func (sm SeenMap) Add(item string) {
	var marker struct{}
	sm[item] = marker
}

// NewSeenMap returns an empty, non-nil SeenMap.
func NewSeenMap() SeenMap {
	return SeenMap{}
}