Go is doing my head in. This took me a long, long time to work out:
// Concurrent web-crawler exercise (Tour of Go). A "historian" goroutine
// owns the visited-set and a coordinator goroutine counts in-flight
// fetches, so Crawl knows when the whole crawl has finished.
package main

import (
	"fmt"
)

type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
	// Url is one unit of work: a page plus the crawl depth remaining below it.
	type Url struct {
		url   string
		depth int
	}
	// HistoryReq asks the historian whether a URL was already visited;
	// the answer comes back on resultChan.
	type HistoryReq struct {
		url        string
		resultChan chan bool
	}

	// historian starts a goroutine that owns the visited-set, serializing
	// all access through the returned request channel (no mutex needed).
	// NOTE(review): this goroutine is never stopped, so it leaks once
	// Crawl returns — harmless in this toy, but close the channel in real code.
	historian := func() chan HistoryReq {
		history := map[string]bool{url: true} // the seed URL counts as visited
		historyReqChan := make(chan HistoryReq)
		go func() {
			for request := range historyReqChan {
				_, visited := history[request.url]
				if !visited {
					history[request.url] = true // mark on first sight
				}
				request.resultChan <- visited // reply with the pre-mark answer
			}
		}()
		return historyReqChan
	}

	// produce fetches one URL, queues its unvisited links, and finally
	// reports completion on doneCounterChan so the coordinator can track
	// how many fetches are still in flight.
	produce := func(urlChannel chan Url, urlRequest Url, historyReqChan chan HistoryReq, doneCounterChan chan bool) {
		getDepth, urlString := urlRequest.depth, urlRequest.url
		if getDepth <= 0 {
			doneCounterChan <- false
			return
		}
		body, urls, err := fetcher.Fetch(urlString)
		if err != nil {
			fmt.Println(err)
		} else {
			fmt.Printf("found: %s %q\n", urlString, body)
			for _, u := range urls {
				historyResultChan := make(chan bool)
				historyReqChan <- HistoryReq{u, historyResultChan}
				if !<-historyResultChan {
					// New links are queued BEFORE this fetch reports done,
					// so the coordinator's counter can never hit zero early.
					urlChannel <- Url{u, getDepth - 1}
				}
			}
		}
		doneCounterChan <- false
	}

	// startWork starts the coordinator: it spawns one producer per queued
	// URL and counts in-flight work; when the count returns to zero the
	// crawl is complete and done is signalled.
	// NOTE(review): the coordinator loop also runs forever (goroutine leak
	// after Crawl returns) — acceptable for this exercise.
	startWork := func(done chan bool, historyReqChan chan HistoryReq) chan Url {
		counter := 0
		doneCounterChan := make(chan bool)
		urlChannel := make(chan Url)
		go func() {
			for {
				select {
				case task := <-urlChannel:
					counter++
					go produce(urlChannel, task, historyReqChan, doneCounterChan)
				case <-doneCounterChan:
					counter--
					if counter == 0 {
						done <- true
					}
				}
			}
		}()
		return urlChannel
	}

	done := make(chan bool)
	historyReqChan := historian()
	startWork(done, historyReqChan) <- Url{url, depth} // queue the seed page
	<-done
}

func main() {
	Crawl("http://golang.org/", 4, fetcher)
}

// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

// fakeResult is one canned page: its body plus the links found on it.
type fakeResult struct {
	body string
	urls []string
}

// Fetch looks the URL up in the canned map, failing for unknown pages.
func (f *fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := (*f)[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = &fakeFetcher{
	"http://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"http://golang.org/pkg/",
			"http://golang.org/cmd/",
		},
	},
	"http://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{
			"http://golang.org/",
			"http://golang.org/cmd/",
			"http://golang.org/pkg/fmt/",
			"http://golang.org/pkg/os/",
		},
	},
	"http://golang.org/pkg/fmt/": &fakeResult{
		"Package fmt",
		[]string{
			"http://golang.org/",
			"http://golang.org/pkg/",
		},
	},
	"http://golang.org/pkg/os/": &fakeResult{
		"Package os",
		[]string{
			"http://golang.org/",
			"http://golang.org/pkg/",
		},
	},
}
Go’s use of channels for synchronisation takes some getting used to :/
Here’s an earlier, less parallel version:
// Earlier, less parallel crawler: the visited-set is guarded by a single
// "token" passed over an unbuffered channel instead of by a dedicated
// historian goroutine.
package main

import (
	"fmt"
)

type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
	// Url is one unit of work: a page plus the crawl depth remaining below it.
	type Url struct {
		url   string
		depth int
	}
	// History guards the visited-set with a token passed over the
	// unbuffered lock channel: whoever holds the token may touch urls.
	type History struct {
		urls map[string]bool
		lock chan bool
	}

	// produce handles exactly one URL from urlChannel, spawning a fresh
	// producer for every unvisited link it discovers. Declared with var so
	// the closure can refer to itself recursively.
	var produce func(urlChannel chan Url, done chan bool, history *History)
	produce = func(urlChannel chan Url, done chan bool, history *History) {
		urlToRetrieve := <-urlChannel
		urlString, getDepth := urlToRetrieve.url, urlToRetrieve.depth
		if getDepth <= 0 {
			return // depth exhausted: exit without ever taking the token
		}
		body, urls, err := fetcher.Fetch(urlString)
		if err != nil {
			fmt.Println(err)
		} else {
			fmt.Printf("found: %s %q\n", urlString, body)
			<-history.lock // acquire the token before touching history.urls
			for _, u := range urls {
				if _, visited := history.urls[u]; !visited {
					history.urls[u] = true
					go produce(urlChannel, done, history)
					urlChannel <- Url{u, getDepth - 1}
				}
			}
			// Hand the token to a waiting producer; if nobody is waiting,
			// assume the crawl has finished and signal done.
			// NOTE(review): this is racy — a freshly spawned producer may
			// not have reached <-history.lock yet, ending the crawl early.
			select {
			case history.lock <- true:
			default:
				done <- true
			}
		}
	}

	history := &History{map[string]bool{url: true}, make(chan bool)}
	urlChannel := make(chan Url)
	done := make(chan bool)
	go produce(urlChannel, done, history)
	urlChannel <- Url{url, depth}
	history.lock <- true // release the token to the first producer
	<-done
}

func main() {
	Crawl("http://golang.org/", 4, fetcher)
}

// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

// fakeResult is one canned page: its body plus the links found on it.
type fakeResult struct {
	body string
	urls []string
}

// Fetch looks the URL up in the canned map, failing for unknown pages.
func (f *fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := (*f)[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = &fakeFetcher{
	"http://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"http://golang.org/pkg/",
			"http://golang.org/cmd/",
		},
	},
	"http://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{
			"http://golang.org/",
			"http://golang.org/cmd/",
			"http://golang.org/pkg/fmt/",
			"http://golang.org/pkg/os/",
		},
	},
	"http://golang.org/pkg/fmt/": &fakeResult{
		"Package fmt",
		[]string{
			"http://golang.org/",
			"http://golang.org/pkg/",
		},
	},
	"http://golang.org/pkg/os/": &fakeResult{
		"Package os",
		[]string{
			"http://golang.org/",
			"http://golang.org/pkg/",
		},
	},
}