Implementation of github query helper library.

To make this easier to read, use, and modify, I've abstracted the
important parts of the github query api into crawler/github/query.go
which allows to describe at a high level what is to be searched without
knowing the API syntax.
This commit is contained in:
Damien Robichaud
2019-07-09 13:30:12 -07:00
parent ca41674df3
commit ac6918d70f
2 changed files with 348 additions and 0 deletions

View File

@@ -0,0 +1,229 @@
package github
import (
"fmt"
"net/url"
"strings"
)
const (
perPageArg = "per_page"
accessTokenArg = "access_token"
githubMaxPageSize = 100
)
// Implementation detail, not important to external API.
type queryField struct {
name string
value interface{}
}
// Formats a query field.
func (qf queryField) String() string {
var value string
switch v := qf.value.(type) {
case string:
value = v
case rangeFormatter:
value = v.RangeString()
default:
value = fmt.Sprint(v)
}
if qf.name == "" {
return value
}
return fmt.Sprint(qf.name, ":", value)
}
// Example of formating a query:
// QueryWith(
// Filename("kustomization.yaml"),
// Filesize(RangeWithin{64, 192}),
// Keyword("copyright"),
// Keyword("2019"),
// ).String()
//
// Outputs "q=filename:kustomization.yaml+size:64..192+copyright+2018" which
// would search for files that have [64, 192] bytes (inclusive range) and that
// contain the keywords 'copyright' and '2019' somewhere in the file.
type Query []queryField
func QueryWith(qfs ...queryField) Query {
return Query(qfs)
}
func (q Query) String() string {
strs := make([]string, 0, len(q))
for _, elem := range q {
str := elem.String()
if str == "" {
continue
}
strs = append(strs, str)
}
query := strings.Join(strs, "+")
if query == "" {
return query
}
return "q=" + query
}
// Keyword takes a single word, and formats it according to the Github API.
func Keyword(k string) queryField {
return queryField{value: k}
}
// Filesize takes a rangeFormatter and formats it according to the Github API.
func Filesize(r rangeFormatter) queryField {
return queryField{name: "size", value: r}
}
// Filename takes a filename and formats it according to the Github API.
func Filename(f string) queryField {
return queryField{name: "filename", value: f}
}
// Path takes a filepath and formats it according to the Github API.
func Path(p string) queryField {
return queryField{name: "path", value: p}
}
// RequestConfig stores common variables that must be present for the queries.
// - CodeSearchRequests: ask Github to check the code indices given a query.
// - ContentsRequests: ask Github where to download a resource given a repo and a
// file path.
// - CommitsRequests: asks Github to list commits made one a file. Useful to
// determine the date of a file.
type RequestConfig struct {
perPage uint64
retryCount uint64
accessToken string
}
func NewRequestConfig(
perPage, retryCount uint64, accessToken string) RequestConfig {
return RequestConfig{
perPage: perPage,
retryCount: retryCount,
accessToken: accessToken,
}
}
// CodeSearchRequestWith given a list of query parameters that specify the
// (patial) query, returns a request object with the (parital) query. Must call
// the URL method to get the string value of the URL. See request.CopyWith, to
// understand why the request object is useful.
func (rc RequestConfig) CodeSearchRequestWith(query Query) request {
req := rc.makeRequest("search/code", query)
req.vals.Set("sort", "indexed")
req.vals.Set("order", "desc")
return req
}
// ContentsRequest given the repo name, and the filepath returns a formatted
// query for the Github API to find the dowload information of this filepath.
func (rc RequestConfig) ContentsRequest(fullRepoName, path string) string {
uri := fmt.Sprintf("repos/%s/contents/%s", fullRepoName, path)
return rc.makeRequest(uri, Query{}).URL()
}
// CommitsRequest given the repo name, and a filepath returns a formatted query
// for the Github API to find the commits that affect this file.
func (rc RequestConfig) CommitsRequest(fullRepoName, path string) string {
uri := fmt.Sprintf("repos/%s/commits", fullRepoName)
return rc.makeRequest(uri, Query{Path(path)}).URL()
}
// How many times to retry the queries before giving up (used by the crawler,
// not Github).
func (rc RequestConfig) RetryCount() uint64 {
return rc.retryCount
}
func (rc RequestConfig) makeRequest(path string, query Query) request {
vals := url.Values{}
if rc.accessToken != "" {
vals.Set(accessTokenArg, rc.accessToken)
}
vals.Set(perPageArg, fmt.Sprint(rc.perPage))
return request{
url: url.URL{
Scheme: "https",
Host: "api.github.com",
Path: path,
},
vals: vals,
query: query,
}
}
type request struct {
url url.URL
vals url.Values
query Query
}
// CopyWith copies the requests and adds the extra query parameters. Usefull
// for dynamically adding sizes to a filename only query without modifying it.
func (r request) CopyWith(queryParams ...queryField) request {
cpy := r
cpy.query = append(cpy.query, queryParams...)
return cpy
}
// URL encodes the variables and the URL representation into a string.
func (r request) URL() string {
// Github does not handle URL encoding properly in its API for the
// q='...', so the query parameter is added without any encoding
// manually.
encoded := r.vals.Encode()
query := r.query.String()
sep := "&"
if query == "" {
sep = ""
}
if encoded == "" && query != "" {
sep = "?"
}
r.url.RawQuery = encoded + sep + query
return r.url.String()
}
// Allows to define a range of numbers and print it in the github range
// query format https://help.github.com/en/articles/understanding-the-search-syntax.
type rangeFormatter interface {
RangeString() string
}
// RangeLessThan is a range of values strictly less than (<) size.
type RangeLessThan struct {
size uint64
}
func (r RangeLessThan) RangeString() string {
return fmt.Sprintf("<%d", r.size)
}
// RangeLessThan is a range of values strictly greater than (>) size.
type RangeGreaterThan struct {
size uint64
}
func (r RangeGreaterThan) RangeString() string {
return fmt.Sprintf(">%d", r.size)
}
// RangeWithin is an inclusive range from start to end.
type RangeWithin struct {
start uint64
end uint64
}
func (r RangeWithin) RangeString() string {
return fmt.Sprintf("%d..%d", r.start, r.end)
}

View File

@@ -0,0 +1,119 @@
package github
import (
"testing"
)
func TestQueryFields(t *testing.T) {
testCases := []struct {
formatter queryField
expected string
}{
{
formatter: Keyword("keyword"),
expected: "keyword",
},
{
formatter: Filesize(RangeLessThan{23}),
expected: "size:<23",
},
{
formatter: Filesize(RangeWithin{24, 64}),
expected: "size:24..64",
},
{
formatter: Filesize(RangeGreaterThan{64}),
expected: "size:>64",
},
{
formatter: Path("some/path/to/file"),
expected: "path:some/path/to/file",
},
{
formatter: Filename("kustomization.yaml"),
expected: "filename:kustomization.yaml",
},
}
for _, test := range testCases {
if result := test.formatter.String(); result != test.expected {
t.Errorf("got (%#v = %s), expected %s", test.formatter, result, test.expected)
}
}
}
func TestQueryType(t *testing.T) {
testCases := []struct {
query Query
expected string
}{
{
query: QueryWith(
Filesize(RangeWithin{24, 64}),
Filename("kustomization.yaml"),
Keyword("keyword1"),
Keyword("keyword2"),
),
expected: "q=size:24..64+filename:kustomization.yaml+keyword1+keyword2",
},
}
for _, test := range testCases {
if queryStr := test.query.String(); queryStr != test.expected {
t.Errorf("got (%#v = %s), expected %s", test.query, queryStr, test.expected)
}
}
}
func TestGithubSearchQuery(t *testing.T) {
const (
accessToken = "random_token"
perPage = 100
)
testCases := []struct {
rc RequestConfig
codeQuery Query
fullRepoName string
path string
expectedCodeQuery string
expectedContentsQuery string
expectedCommitsQuery string
}{
{
rc: RequestConfig{
perPage: perPage,
accessToken: accessToken,
},
codeQuery: Query{
Filename("kustomization.yaml"),
Filesize(RangeWithin{64, 128}),
},
fullRepoName: "kubernetes-sigs/kustomize",
path: "examples/helloWorld/kustomization.yaml",
expectedCodeQuery: "https://api.github.com/search/code?" +
"access_token=random_token&order=desc&per_page=100&sort=indexed&q=filename:kustomization.yaml+size:64..128",
expectedContentsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/contents/" +
"examples/helloWorld/kustomization.yaml?access_token=random_token&per_page=100",
expectedCommitsQuery: "https://api.github.com/repos/kubernetes-sigs/kustomize/commits?" +
"access_token=random_token&per_page=100&q=path:examples/helloWorld/kustomization.yaml",
},
}
for _, test := range testCases {
if result := test.rc.CodeSearchRequestWith(test.codeQuery).URL(); result != test.expectedCodeQuery {
t.Errorf("Got code query: %s, expected %s", result, test.expectedCodeQuery)
}
if result := test.rc.ContentsRequest(test.fullRepoName, test.path); result != test.expectedContentsQuery {
t.Errorf("Got contents query: %s, expected %s", result, test.expectedContentsQuery)
}
if result := test.rc.CommitsRequest(test.fullRepoName, test.path); result != test.expectedCommitsQuery {
t.Errorf("Got commits query: %s, expected %s", result, test.expectedCommitsQuery)
}
}
}