Go is doing my head in. This took me a long, long time to get out:
package main
import (
"fmt"
)
// Fetcher abstracts page retrieval so Crawl can run against canned
// data (see fakeFetcher below) as well as a real HTTP fetcher.
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
// err is non-nil when the page cannot be retrieved.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to concurrently crawl pages starting with url,
// following links to a maximum of depth. It blocks until every
// in-flight fetch has signalled completion.
func Crawl(url string, depth int, fetcher Fetcher) {
	// Url is one unit of work: a page to fetch plus its remaining depth budget.
	type Url struct {
		url   string
		depth int
	}
	// HistoryReq asks the historian whether url has been visited;
	// the answer comes back on resultChan (true = already visited).
	type HistoryReq struct {
		url        string
		resultChan chan bool
	}
	// historian starts a goroutine that owns the visited-set,
	// serialising all access through a channel instead of a mutex.
	// Each request marks the URL visited and replies whether it had
	// been seen before.
	// NOTE(review): this goroutine never exits; fine for a demo, but a
	// real crawler should close historyReqChan when the crawl is done.
	historian := func() chan HistoryReq {
		history := map[string]bool{url: true}
		historyReqChan := make(chan HistoryReq)
		go func() {
			for request := range historyReqChan {
				_, visited := history[request.url]
				if !visited {
					history[request.url] = true
				}
				request.resultChan <- visited
			}
		}()
		return historyReqChan
	}
	// produce fetches one URL, enqueues its unvisited links, and always
	// signals completion on doneCounterChan exactly once.
	produce := func(urlChannel chan Url, urlRequest Url, historyReqChan chan HistoryReq, doneCounterChan chan bool) {
		getDepth, urlString := urlRequest.depth, urlRequest.url
		if getDepth <= 0 {
			doneCounterChan <- false
			return
		}
		body, urls, err := fetcher.Fetch(urlString)
		if err != nil {
			fmt.Println(err)
		} else {
			// BUG FIX: the format string was "%qn" (missing backslash),
			// which printed a literal 'n' instead of ending the line.
			fmt.Printf("found: %s %q\n", urlString, body)
			for _, u := range urls {
				historyResultChan := make(chan bool)
				historyReqChan <- HistoryReq{u, historyResultChan}
				if !<-historyResultChan {
					urlChannel <- Url{u, getDepth - 1}
				}
			}
		}
		doneCounterChan <- false
	}
	// startWork runs the coordinator goroutine: it counts outstanding
	// produce goroutines and signals done when the count falls to zero.
	// NOTE(review): like the historian, this goroutine never exits.
	startWork := func(done chan bool, historyReqChan chan HistoryReq) chan Url {
		counter := 0
		doneCounterChan := make(chan bool)
		urlChannel := make(chan Url)
		go func() {
			for {
				select {
				case task := <-urlChannel:
					counter++
					go produce(urlChannel, task, historyReqChan, doneCounterChan)
				case <-doneCounterChan:
					counter--
					if counter == 0 {
						done <- true
					}
				}
			}
		}()
		return urlChannel
	}
	done := make(chan bool)
	historyReqChan := historian()
	// Seed the coordinator with the root URL, then wait for quiescence.
	startWork(done, historyReqChan) <- Url{url, depth}
	<-done
}
func main() {
// Kick off a depth-4 crawl of the canned site defined below.
Crawl("http://golang.org/", 4, fetcher)
}
// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

// fakeResult is one canned page: its body text and outgoing links.
type fakeResult struct {
	body string
	urls []string
}

// Fetch looks url up in the canned map and returns the page body and
// links, or an error for an unknown URL. A value receiver is idiomatic
// here: maps are reference types, so the pointer indirection `(*f)[url]`
// was unnecessary — and *fakeFetcher still satisfies Fetcher because a
// pointer's method set includes value-receiver methods.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher: a small site whose pages link to
// each other. Note "http://golang.org/cmd/" is linked to but absent
// from the map, so fetching it exercises the error path.
var fetcher = &fakeFetcher{
"http://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"http://golang.org/pkg/",
"http://golang.org/cmd/",
},
},
"http://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"http://golang.org/",
"http://golang.org/cmd/",
"http://golang.org/pkg/fmt/",
"http://golang.org/pkg/os/",
},
},
"http://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"http://golang.org/",
"http://golang.org/pkg/",
},
},
"http://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"http://golang.org/",
"http://golang.org/pkg/",
},
},
}
Go’s use of channels for synchronisation takes some getting used to :/
Here’s an earlier, less parallel version:
package main
import (
"fmt"
)
// Fetcher abstracts page retrieval so Crawl can run against canned
// data (see fakeFetcher below) as well as a real HTTP fetcher.
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
// err is non-nil when the page cannot be retrieved.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl pages starting with url,
// to a maximum of depth. Exclusive access to the visited-set is
// arranged by passing a "baton" token over an unbuffered channel.
func Crawl(url string, depth int, fetcher Fetcher) {
	// Url is one unit of work: a page to fetch plus its remaining depth budget.
	type Url struct {
		url   string
		depth int
	}
	// History pairs the visited-set with a one-token channel used as a lock.
	type History struct {
		urls map[string]bool
		lock chan bool
	}
	// produce is declared before being defined so it can call itself.
	var produce func(urlChannel chan Url, done chan bool, history *History)
	produce = func(urlChannel chan Url, done chan bool, history *History) {
		urlToRetrieve := <-urlChannel
		urlString, getDepth := urlToRetrieve.url, urlToRetrieve.depth
		if getDepth <= 0 {
			return
		}
		body, urls, err := fetcher.Fetch(urlString)
		if err != nil {
			fmt.Println(err)
		} else {
			// BUG FIX: the format string was "%qn" (missing backslash),
			// which printed a literal 'n' instead of ending the line.
			fmt.Printf("found: %s %q\n", urlString, body)
			<-history.lock // acquire the baton before touching history.urls
			for _, u := range urls {
				if _, visited := history.urls[u]; !visited {
					history.urls[u] = true
					go produce(urlChannel, done, history)
					urlChannel <- Url{u, getDepth - 1}
				}
			}
			// Hand the baton to a waiting producer if there is one;
			// otherwise treat the crawl as finished.
			// NOTE(review): this termination test is racy — a freshly
			// spawned producer that has not yet reached its lock receive
			// makes the default case fire prematurely. Kept as-is (this
			// is the "earlier" version); fixing it properly needs an
			// outstanding-work counter as in the later version.
			select {
			case history.lock <- true:
			default:
				done <- true
			}
		}
	}
	history := &History{map[string]bool{url: true}, make(chan bool)}
	urlChannel := make(chan Url)
	done := make(chan bool)
	go produce(urlChannel, done, history)
	urlChannel <- Url{url, depth}
	history.lock <- true // release the baton to the first producer
	<-done
}
func main() {
// Kick off a depth-4 crawl of the canned site defined below.
Crawl("http://golang.org/", 4, fetcher)
}
// fakeFetcher is a Fetcher backed by a fixed map of canned results.
type fakeFetcher map[string]*fakeResult

// fakeResult holds the canned body and outgoing links for one page.
type fakeResult struct {
	body string
	urls []string
}

// Fetch returns the canned body and links for url, or an error when
// the page is not present in the map.
func (f *fakeFetcher) Fetch(url string) (string, []string, error) {
	res, found := (*f)[url]
	if !found {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return res.body, res.urls, nil
}
// fetcher is a populated fakeFetcher: a small site whose pages link to
// each other. Note "http://golang.org/cmd/" is linked to but absent
// from the map, so fetching it exercises the error path.
var fetcher = &fakeFetcher{
"http://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"http://golang.org/pkg/",
"http://golang.org/cmd/",
},
},
"http://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"http://golang.org/",
"http://golang.org/cmd/",
"http://golang.org/pkg/fmt/",
"http://golang.org/pkg/os/",
},
},
"http://golang.org/pkg/fmt/": &fakeResult{
"Package fmt",
[]string{
"http://golang.org/",
"http://golang.org/pkg/",
},
},
"http://golang.org/pkg/os/": &fakeResult{
"Package os",
[]string{
"http://golang.org/",
"http://golang.org/pkg/",
},
},
}