From ac6918d70fe20b6937ae6a79a0840b8116118142 Mon Sep 17 00:00:00 2001 From: Damien Robichaud Date: Tue, 9 Jul 2019 13:30:12 -0700 Subject: [PATCH] Implementation of github query helper library. To make this easier to read, use, and modify, I've abstracted the important parts of the github query api into crawler/github/query.go which allows to describe at a high level what is to be searched without knowing the API syntax. --- internal/search/crawler/github/queries.go | 229 ++++++++++++++++++ .../search/crawler/github/queries_test.go | 119 +++++++++ 2 files changed, 348 insertions(+) create mode 100644 internal/search/crawler/github/queries.go create mode 100644 internal/search/crawler/github/queries_test.go diff --git a/internal/search/crawler/github/queries.go b/internal/search/crawler/github/queries.go new file mode 100644 index 000000000..7dc307914 --- /dev/null +++ b/internal/search/crawler/github/queries.go @@ -0,0 +1,229 @@ +package github + +import ( + "fmt" + "net/url" + "strings" +) + +const ( + perPageArg = "per_page" + accessTokenArg = "access_token" + + githubMaxPageSize = 100 +) + +// queryField is one term of a search query: an optional qualifier name plus +// its value. Implementation detail, not important to external API. +type queryField struct { + name string + value interface{} +} + +// String formats a query field as "name:value", or as the bare value when +// name is empty. String values are used as is, rangeFormatter values are +// rendered with RangeString, and anything else goes through fmt.Sprint. +func (qf queryField) String() string { + var value string + switch v := qf.value.(type) { + case string: + value = v + case rangeFormatter: + value = v.RangeString() + default: + value = fmt.Sprint(v) + } + + if qf.name == "" { + return value + } + return fmt.Sprint(qf.name, ":", value) +} + +// Example of formatting a query: +// QueryWith( +// Filename("kustomization.yaml"), +// Filesize(RangeWithin{64, 192}), +// Keyword("copyright"), +// Keyword("2019"), +// ).String() +// +// Outputs "q=filename:kustomization.yaml+size:64..192+copyright+2019" which +// would search for files that have [64, 192] bytes (inclusive range) and that +// contain the keywords 'copyright' and '2019' somewhere in the file. 
+type Query []queryField + +func QueryWith(qfs ...queryField) Query { + return Query(qfs) +} + +func (q Query) String() string { + strs := make([]string, 0, len(q)) + for _, elem := range q { + str := elem.String() + if str == "" { + continue + } + strs = append(strs, str) + } + + query := strings.Join(strs, "+") + if query == "" { + return query + } + return "q=" + query +} + +// Keyword takes a single word, and formats it according to the Github API. +func Keyword(k string) queryField { + return queryField{value: k} +} + +// Filesize takes a rangeFormatter and formats it according to the Github API. +func Filesize(r rangeFormatter) queryField { + return queryField{name: "size", value: r} +} + +// Filename takes a filename and formats it according to the Github API. +func Filename(f string) queryField { + return queryField{name: "filename", value: f} +} + +// Path takes a filepath and formats it according to the Github API. +func Path(p string) queryField { + return queryField{name: "path", value: p} +} + +// RequestConfig stores common variables that must be present for the queries. +// - CodeSearchRequests: ask Github to check the code indices given a query. +// - ContentsRequests: ask Github where to download a resource given a repo and a +// file path. +// - CommitsRequests: asks Github to list commits made one a file. Useful to +// determine the date of a file. +type RequestConfig struct { + perPage uint64 + retryCount uint64 + accessToken string +} + +func NewRequestConfig( + perPage, retryCount uint64, accessToken string) RequestConfig { + + return RequestConfig{ + perPage: perPage, + retryCount: retryCount, + accessToken: accessToken, + } +} + +// CodeSearchRequestWith given a list of query parameters that specify the +// (patial) query, returns a request object with the (parital) query. Must call +// the URL method to get the string value of the URL. See request.CopyWith, to +// understand why the request object is useful. 
+func (rc RequestConfig) CodeSearchRequestWith(query Query) request { + req := rc.makeRequest("search/code", query) + req.vals.Set("sort", "indexed") + req.vals.Set("order", "desc") + return req +} + +// ContentsRequest given the repo name, and the filepath returns a formatted +// query for the Github API to find the dowload information of this filepath. +func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string { + uri := fmt.Sprintf("repos/%s/contents/%s", fullRepoName, path) + return rc.makeRequest(uri, Query{}).URL() +} + +// CommitsRequest given the repo name, and a filepath returns a formatted query +// for the Github API to find the commits that affect this file. +func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string { + uri := fmt.Sprintf("repos/%s/commits", fullRepoName) + return rc.makeRequest(uri, Query{Path(path)}).URL() +} + +// How many times to retry the queries before giving up (used by the crawler, +// not Github). +func (rc RequestConfig) RetryCount() uint64 { + return rc.retryCount +} + +func (rc RequestConfig) makeRequest(path string, query Query) request { + vals := url.Values{} + if rc.accessToken != "" { + vals.Set(accessTokenArg, rc.accessToken) + } + vals.Set(perPageArg, fmt.Sprint(rc.perPage)) + + return request{ + url: url.URL{ + Scheme: "https", + Host: "api.github.com", + Path: path, + }, + vals: vals, + query: query, + } +} + +type request struct { + url url.URL + vals url.Values + query Query +} + +// CopyWith copies the requests and adds the extra query parameters. Usefull +// for dynamically adding sizes to a filename only query without modifying it. +func (r request) CopyWith(queryParams ...queryField) request { + cpy := r + cpy.query = append(cpy.query, queryParams...) + return cpy +} + +// URL encodes the variables and the URL representation into a string. 
+func (r request) URL() string { + // Github does not handle URL encoding properly in its API for the + // q='...', so the query parameter is added without any encoding + // manually. + encoded := r.vals.Encode() + query := r.query.String() + sep := "&" + if query == "" { + sep = "" + } + if encoded == "" && query != "" { + sep = "?" + } + r.url.RawQuery = encoded + sep + query + return r.url.String() +} + +// Allows to define a range of numbers and print it in the github range +// query format https://help.github.com/en/articles/understanding-the-search-syntax. +type rangeFormatter interface { + RangeString() string +} + +// RangeLessThan is a range of values strictly less than (<) size. +type RangeLessThan struct { + size uint64 +} + +func (r RangeLessThan) RangeString() string { + return fmt.Sprintf("<%d", r.size) +} + +// RangeLessThan is a range of values strictly greater than (>) size. +type RangeGreaterThan struct { + size uint64 +} + +func (r RangeGreaterThan) RangeString() string { + return fmt.Sprintf(">%d", r.size) +} + +// RangeWithin is an inclusive range from start to end. 
+type RangeWithin struct { + start uint64 + end uint64 +} + +// RangeString formats the range as "start..end". +func (r RangeWithin) RangeString() string { + return fmt.Sprintf("%d..%d", r.start, r.end) +} diff --git a/internal/search/crawler/github/queries_test.go b/internal/search/crawler/github/queries_test.go new file mode 100644 index 000000000..98ef7d564 --- /dev/null +++ b/internal/search/crawler/github/queries_test.go @@ -0,0 +1,119 @@ +package github + +import ( + "testing" +) + +// TestQueryFields checks the string form produced by each query field +// constructor on its own. +func TestQueryFields(t *testing.T) { + testCases := []struct { + formatter queryField + expected string + }{ + { + formatter: Keyword("keyword"), + expected: "keyword", + }, + { + formatter: Filesize(RangeLessThan{23}), + expected: "size:<23", + }, + { + formatter: Filesize(RangeWithin{24, 64}), + expected: "size:24..64", + }, + { + formatter: Filesize(RangeGreaterThan{64}), + expected: "size:>64", + }, + { + formatter: Path("some/path/to/file"), + expected: "path:some/path/to/file", + }, + { + formatter: Filename("kustomization.yaml"), + expected: "filename:kustomization.yaml", + }, + } + + for _, test := range testCases { + if result := test.formatter.String(); result != test.expected { + t.Errorf("got (%#v = %s), expected %s", test.formatter, result, test.expected) + } + } +} + +// TestQueryType checks that a Query prints its fields joined by '+' behind a +// "q=" prefix, in the order they were given. +func TestQueryType(t *testing.T) { + testCases := []struct { + query Query + expected string + }{ + { + query: QueryWith( + Filesize(RangeWithin{24, 64}), + Filename("kustomization.yaml"), + Keyword("keyword1"), + Keyword("keyword2"), + ), + expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2", + }, + } + + for _, test := range testCases { + if queryStr := test.query.String(); queryStr != test.expected { + t.Errorf("got (%#v = %s), expected %s", test.query, queryStr, test.expected) + } + + } +} + +// TestGithubSearchQuery checks the complete URLs built for code search, +// contents and commits requests from one RequestConfig. +func TestGithubSearchQuery(t *testing.T) { + const ( + accessToken = "random_token" + perPage = 100 + ) + + testCases := []struct { + rc RequestConfig + codeQuery Query + fullRepoName string + path string + expectedCodeQuery string + 
expectedContentsQuery string + expectedCommitsQuery string + }{ + { + rc: RequestConfig{ + perPage: perPage, + accessToken: accessToken, + }, + codeQuery: Query{ + Filename("kustomization.yaml"), + Filesize(RangeWithin{64, 128}), + }, + fullRepoName: "kubernetes-sigs/kustomize", + path: "examples/helloWorld/kustomization.yaml", + + expectedCodeQuery: "https://api.github.com/search/code?" + + "access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128", + + expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" + + "examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100", + + expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" + + "access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml", + }, + } + + for _, test := range testCases { + if result := test.rc.CodeSearchRequestWith(test.codeQuery).URL(); result != test.expectedCodeQuery { + t.Errorf("Got code query: %s, expected %s", result, test.expectedCodeQuery) + } + + if result := test.rc.ContentsRequest(test.fullRepoName, test.path); result != test.expectedContentsQuery { + t.Errorf("Got contents query: %s, expected %s", result, test.expectedContentsQuery) + } + if result := test.rc.CommitsRequest(test.fullRepoName, test.path); result != test.expectedCommitsQuery { + t.Errorf("Got commits query: %s, expected %s", result, test.expectedCommitsQuery) + } + } +}