Merge pull request #2150 from haiyanmeng/stats

Add `fileType` and `User` into the index
This commit is contained in:
Jeff Regan
2020-01-28 09:18:31 -08:00
committed by GitHub
18 changed files with 1276 additions and 77 deletions

View File

@@ -60,8 +60,16 @@ func NewCrawler(accessToken string, retryCount uint64, client *http.Client,
}
}
func (gc githubCrawler) SetDefaultBranch(repo, branch string) {
gc.branchMap[repo] = branch
func (gc githubCrawler) SetDefaultBranch(d *doc.Document) {
url := gc.client.ReposRequest(d.RepositoryFullName())
defaultBranch, err := gc.client.GetDefaultBranch(url, d.RepositoryURL, gc.branchMap)
if err != nil {
logger.Printf(
"(error: %v) setting default_branch to master\n", err)
defaultBranch = "master"
}
d.DefaultBranch = defaultBranch
gc.branchMap[d.RepositoryURL] = d.DefaultBranch
}
func (gc githubCrawler) DefaultBranch(repo string) string {
@@ -79,10 +87,20 @@ func (gc githubCrawler) Crawl(ctx context.Context,
accessToken: gc.client.accessToken,
}
var ranges []string
var err error
// Since Github returns a max of 1000 results per query, we can use
// multiple queries that split the search space into chunks of at most
// 1000 files to get all of the data.
ranges, err := FindRangesForRepoSearch(newCache(noETagClient, gc.query))
for i := 0; i < 5; i++ {
ranges, err = FindRangesForRepoSearch(newCache(noETagClient, gc.query))
if err == nil {
logger.Printf("FindRangesForRepoSearch succeeded after %d retries", i)
break
} else {
time.Sleep(time.Minute)
}
}
if err != nil {
return fmt.Errorf("could not split %v into ranges, %v\n",
gc.query, err)
@@ -114,19 +132,6 @@ func (gc githubCrawler) Crawl(ctx context.Context,
// it will try to add each string in konfig.RecognizedKustomizationFileNames() to
// d.FilePath, and try to fetch the document again.
func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error {
// set the default branch if it is empty
if d.DefaultBranch == "" {
url := gc.client.ReposRequest(d.RepositoryFullName())
defaultBranch, err := gc.client.GetDefaultBranch(url, d.RepositoryURL, gc.branchMap)
if err != nil {
logger.Printf(
"(error: %v) setting default_branch to master\n", err)
defaultBranch = "master"
}
d.DefaultBranch = defaultBranch
}
gc.SetDefaultBranch(d.RepositoryURL, d.DefaultBranch)
repoURL := d.RepositoryURL + "/" + d.FilePath + "?ref=" + d.DefaultBranch
repoSpec, err := git.NewRepoSpecFromUrl(repoURL)
if err != nil {
@@ -283,10 +288,13 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap,
defaultBranch = "master"
}
// document here is a kustomization file found by querying Github, whose
// `FileType` field should be empty.
document := doc.Document{
FilePath: k.Path,
DefaultBranch: defaultBranch,
RepositoryURL: k.Repository.URL,
User: doc.UserName(k.Repository.URL),
}
if seen.Seen(document.ID()) {
@@ -304,6 +312,7 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap,
FilePath: k.Path,
DefaultBranch: defaultBranch,
RepositoryURL: k.Repository.URL,
User: doc.UserName(k.Repository.URL),
},
}
creationTime, err := gcl.GetFileCreationTime(k)