Multiple improvements to the crawler

1) Set document IDs to avoid duplicating documents;
2) Set the `creationTime` field of each document in the index;
3) Set the `values`, `kinds` and `identifiers` fields for all documents;
4) Add a `Copy` method to the `Document` struct: this fixes the issue
where all the documents in the index point to the same `Document`
object;
5) Avoid using the Redis keystore;
6) Set imagePullPolicy to `Always` for crawler jobs.
This commit is contained in:
Haiyan Meng
2019-12-05 09:51:22 -08:00
parent 54b1549586
commit bffc0d7071
13 changed files with 125 additions and 36 deletions

View File

@@ -133,8 +133,12 @@ func (gc githubCrawler) FetchDocument(_ context.Context, d *doc.Document) error
}
func (gc githubCrawler) SetCreated(_ context.Context, d *doc.Document) error {
fs := GhFileSpec{}
fs.Repository.FullName = d.RepositoryURL + "/" + d.FilePath
fs := GhFileSpec{
Path: d.FilePath,
Repository: GitRepository{
FullName: d.RepositoryFullName(),
},
}
creationTime, err := gc.client.GetFileCreationTime(fs)
if err != nil {
return err
@@ -185,9 +189,9 @@ func processQuery(ctx context.Context, gcl GhClient, query string,
for _, file := range page.Parsed.Items {
k, err := kustomizationResultAdapter(gcl, file)
if err != nil {
logger.Printf("kustomizationResultAdapter failed: %v", err)
errs = append(errs, err)
errorCnt++
continue
}
output <- k
totalCnt++
@@ -224,6 +228,18 @@ func kustomizationResultAdapter(gcl GhClient, k GhFileSpec) (
RepositoryURL: k.Repository.URL,
},
}
logger.Printf("Set the creationTime field")
creationTime, err := gcl.GetFileCreationTime(k)
if err != nil {
logger.Printf("GetFileCreationTime failed: %v", err)
return &d, err
}
d.CreationTime = &creationTime
if err := d.ParseYAML(); err != nil {
logger.Printf("ParseYAML failed: %v", err)
return &d, err
}
return &d, nil
}
@@ -410,13 +426,15 @@ func (e multiError) Error() string {
return strings.Join(strs, "\n")
}
// GitRepository holds the repository fields of a file spec as decoded from
// JSON. It is the type of GhFileSpec.Repository.
type GitRepository struct {
// API is read from the "url" JSON key.
// NOTE(review): presumably the API endpoint of the repository — confirm.
API string `json:"url,omitempty"`
// URL is read from the "html_url" JSON key.
URL string `json:"html_url,omitempty"`
// FullName is read from the "full_name" JSON key.
// NOTE(review): presumably of the form "owner/repo" — confirm.
FullName string `json:"full_name,omitempty"`
}
type GhFileSpec struct {
Path string `json:"path,omitempty"`
Repository struct {
API string `json:"url,omitempty"`
URL string `json:"html_url,omitempty"`
FullName string `json:"full_name,omitempty"`
} `json:"repository,omitempty"`
Path string `json:"path,omitempty"`
Repository GitRepository `json:"repository,omitempty"`
}
type githubResponse struct {

View File

@@ -7,7 +7,7 @@ import (
)
const (
perPageArg = "per_page"
perPageArg = "per_page"
)
const githubMaxPageSize = 100

View File

@@ -68,7 +68,7 @@ func TestQueryType(t *testing.T) {
func TestGithubSearchQuery(t *testing.T) {
const (
perPage = 100
perPage = 100
)
testCases := []struct {
@@ -82,7 +82,7 @@ func TestGithubSearchQuery(t *testing.T) {
}{
{
rc: RequestConfig{
perPage: perPage,
perPage: perPage,
},
codeQuery: Query{
Filename("kustomization.yaml"),