Go Language Tour #71 – Web Crawler

Go is doing my head in. This took me a long long time to get out:

package main

import (  
    "fmt"
)

type Fetcher interface {  
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {  
    type Url struct {
        url string
        depth int
    }
    type HistoryReq struct {
        url string
        resultChan chan bool
    }

    historian := func() chan HistoryReq {
        history := map[string]bool{ url: true }
        historyReqChan := make(chan HistoryReq)
        go func() {
            for request := range historyReqChan {
                _, visited := history[request.url]
                if !visited {
                    history[request.url] = true
                } 
                request.resultChan <- visited
            }
        }()
        return historyReqChan
    }

    produce := func(urlChannel chan Url, urlRequest Url, historyReqChan chan HistoryReq, doneCounterChan chan bool) {
        getDepth, urlString := urlRequest.depth, urlRequest.url
        if (getDepth <= 0) {
            doneCounterChan <- false
            return
        }

        body, urls, err := fetcher.Fetch(urlString)
        if err != nil {
            fmt.Println(err)
        } else {
            fmt.Printf("found: %s %qn", urlString, body)
            for _, u := range urls {
                historyResultChan := make(chan bool)
                historyReq := HistoryReq{u, historyResultChan}
                historyReqChan <- historyReq
                if !<- historyResultChan {
                    urlChannel <- Url { u, getDepth - 1 }
                }                                                   
          }
        }
        doneCounterChan <- false
    }
    startWork := func(done chan bool, historyReqChan chan HistoryReq) chan Url {
        counter := 0
        doneCounterChan := make(chan bool)
        urlChannel := make(chan Url)
        go func() {
            for {
                select {
                case task := <- urlChannel:
                    counter++
                    go produce(urlChannel, task, historyReqChan, doneCounterChan)
                case <- doneCounterChan:
                    counter--
                    if counter == 0 {
                        done <- true
                    }
                }
            }
        }()
        return urlChannel
    }
    done := make(chan bool)    
    historyReqChan := historian()
    startWork(done, historyReqChan) <- Url{url, depth}

    <- done
    return
}

func main() {  
    Crawl("http://golang.org/", 4, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {  
    body string
    urls []string
}

func (f *fakeFetcher) Fetch(url string) (string, []string, error) {  
    if res, ok := (*f)[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = &fakeFetcher{  
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
}

Go’s use of channels for synchronisation takes some getting used to :/

Here’s an earlier, less parallel version:

package main

import (  
    "fmt"
)

type Fetcher interface {  
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {  
    type Url struct {
        url string
        depth int
    }
    type History struct {
        urls map[string]bool
        lock chan bool
    }

    var produce func(urlChannel chan Url, done chan bool, history *History)
    produce = func(urlChannel chan Url, done chan bool, history *History) { 
        urlToRetrieve := <- urlChannel
        urlString, getDepth := urlToRetrieve.url, urlToRetrieve.depth
        if getDepth <= 0 {
            return
        }

        body, urls, err := fetcher.Fetch(urlString)
        if err != nil {
            fmt.Println(err)
        } else {
            fmt.Printf("found: %s %qn", urlString, body)
            <- history.lock 
            for _, u := range urls {
                if _, visited := history.urls[u]; !visited { 
                    history.urls[u] = true
                    go produce(urlChannel, done, history)
                    urlChannel <- Url { u, getDepth - 1 }
                }                                                   
            }
            select {
            case history.lock <- true:
            default:                      
                done <- true;
            }
        }
    }

    history := &History{ map[string]bool{ url: true }, make(chan bool) }
    urlChannel := make(chan Url)
    done := make(chan bool)

    go produce(urlChannel, done, history)
    urlChannel <- Url{url, depth}
    history.lock <- true

    <- done
    return
}

func main() {  
    Crawl("http://golang.org/", 4, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {  
    body string
    urls []string
}

func (f *fakeFetcher) Fetch(url string) (string, []string, error) {  
    if res, ok := (*f)[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = &fakeFetcher{  
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
}

 

Leave a Reply

Your email address will not be published. Required fields are marked *