// Post-change contents of api/internal/crawl/utils/utils.go (package utils,
// requires `import "sync"`).
//
// Companion change in api/internal/crawl/cmd/crawler/crawler.go: inside the
// seedDocs loop in main, the direct map write
//     seen[d.ID()] = d.FileType
// becomes
//     seen.Set(d.ID(), d.FileType)

// SeenMap is a string-to-string map whose accessors are guarded by a
// read/write lock so it can be shared between goroutines.
//
// SeenMap values are passed around and used through value receivers, so the
// lock is held by pointer: every copy of a SeenMap then shares the same
// mutex (and, because maps are reference types, the same underlying data).
// Storing the sync.RWMutex by value would give each copy its own private
// lock and provide no mutual exclusion at all.
//
// The zero value is not usable; always construct a SeenMap with NewSeenMap.
type SeenMap struct {
	data map[string]string
	lock *sync.RWMutex
}

// Seen reports whether item has been recorded in the map.
func (seen SeenMap) Seen(item string) bool {
	seen.lock.RLock()
	defer seen.lock.RUnlock()

	_, ok := seen.data[item]
	return ok
}

// Set records v under key k, overwriting any previous value.
func (seen SeenMap) Set(k, v string) {
	seen.lock.Lock()
	defer seen.lock.Unlock()

	seen.data[k] = v
}

// Value returns the value stored under k.
// The caller should make sure that key is in the map; a missing key yields
// the empty string.
func (seen SeenMap) Value(k string) string {
	seen.lock.RLock()
	defer seen.lock.RUnlock()

	return seen.data[k]
}

// NewSeenMap returns an empty SeenMap that is ready for concurrent use.
func NewSeenMap() SeenMap {
	return SeenMap{
		data: make(map[string]string),
		lock: &sync.RWMutex{},
	}
}