Go Tour Exercise: Web Crawler - all goroutines are asleep - deadlock
Exercise from: https://tour.golang.org/concurrency/10
Description:
In this exercise you'll use Go's concurrency features to parallelize a web crawler.
Modify the Crawl function to fetch URLs in parallel without fetching the same URL twice.
Hint: you can keep a cache of the URLs that have been fetched on a map, but maps alone are not safe for concurrent use!
Here is my answer:
package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

var crawledURLs = make(map[string]bool)
var mux sync.Mutex

func CrawlURL(url string, depth int, fetcher Fetcher, quit chan bool) {
    defer func() {
        quit <- true
    }()
    if depth <= 0 {
        return
    }

    mux.Lock()
    _, isCrawled := crawledURLs[url]
    if isCrawled {
        return
    }
    crawledURLs[url] = true
    mux.Unlock()

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    quitThis := make(chan bool)
    for _, u := range urls {
        go CrawlURL(u, depth-1, fetcher, quitThis)
    }
    for range urls {
        <-quitThis
    }
    return
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    CrawlURL(url, depth, fetcher, make(chan bool))
    return
}

func main() {
    Crawl("https://golang.org/", 4, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}
And it outputs:
found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/ "Packages"
found: https://golang.org/pkg/os/ "Package os"
fatal error: all goroutines are asleep - deadlock!
I want to know why the deadlock happens. Is it because I am using the channels in the wrong way?
I noticed that I forgot to release the mutex when the URL was already crawled, so I edited the code like this:
...
    if isCrawled {
        mux.Unlock() // added this line
        return
    }
...
But the deadlock is still there, and the output is different:
found: https://golang.org/ "The Go Programming Language"
not found: https://golang.org/cmd/
found: https://golang.org/pkg/ "Packages"
found: https://golang.org/pkg/os/ "Package os"
found: https://golang.org/pkg/fmt/ "Package fmt"
fatal error: all goroutines are asleep - deadlock!
The main problem is that every call to CrawlURL sends on its quit channel just before returning, but for the very first call nobody ever receives from that channel: Crawl runs CrawlURL synchronously and never reads from the make(chan bool) it passes in, so the deferred quit <- true blocks forever and the runtime reports a deadlock.
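To see the failing mechanism in isolation, here is a minimal standalone program (my illustration, not part of the original question or answer) that dies with the exact same runtime error: a send on an unbuffered channel blocks until another goroutine receives, and when no runnable goroutine is left, the Go runtime aborts.

package main

// Sending on an unbuffered channel blocks until a receiver is ready.
// Since no other goroutine exists to receive, every goroutine (here,
// only main) is blocked, and the runtime reports:
// "fatal error: all goroutines are asleep - deadlock!"
func main() {
    quit := make(chan bool)
    quit <- true // blocks forever: nobody receives from quit
}

This is exactly what the deferred quit <- true does in the first CrawlURL call above.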
Also, if you really need to synchronize goroutines, I would suggest using the sync API. Channels are better used for communicating and sharing data.
Here is a solution using the sync API; a sketch with sync.WaitGroup follows below.
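A minimal sketch of that approach, under my own naming (safeSet, crawlWG, and the tiny two-page fakeFetcher are illustrative stand-ins, not the code from the linked playground):

package main

import (
    "fmt"
    "sync"
)

// Fetcher matches the interface from the exercise.
type Fetcher interface {
    Fetch(url string) (body string, urls []string, err error)
}

// safeSet wraps the visited-URL map in a mutex, since maps alone
// are not safe for concurrent use.
type safeSet struct {
    mu   sync.Mutex
    seen map[string]bool
}

// tryAdd marks url as visited and reports whether it was new.
func (s *safeSet) tryAdd(url string) bool {
    s.mu.Lock()
    defer s.mu.Unlock()
    if s.seen[url] {
        return false
    }
    s.seen[url] = true
    return true
}

// crawlWG fetches url and its children in parallel; the WaitGroup
// replaces the quit channel as the completion signal.
func crawlWG(url string, depth int, fetcher Fetcher, visited *safeSet, wg *sync.WaitGroup) {
    defer wg.Done()
    if depth <= 0 || !visited.tryAdd(url) {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        wg.Add(1) // count each child before spawning it
        go crawlWG(u, depth-1, fetcher, visited, wg)
    }
}

// Crawl starts the crawl and blocks until every goroutine has finished.
func Crawl(url string, depth int, fetcher Fetcher) {
    visited := &safeSet{seen: make(map[string]bool)}
    var wg sync.WaitGroup
    wg.Add(1)
    go crawlWG(url, depth, fetcher, visited, &wg)
    wg.Wait()
}

// fakeFetcher is a tiny canned Fetcher so the sketch runs standalone.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

func main() {
    fetcher := fakeFetcher{
        "https://golang.org/":     &fakeResult{"The Go Programming Language", []string{"https://golang.org/pkg/"}},
        "https://golang.org/pkg/": &fakeResult{"Packages", []string{"https://golang.org/"}},
    }
    Crawl("https://golang.org/", 4, fetcher)
}

The key discipline is that wg.Add(1) runs in the parent before the child goroutine starts, so the counter can never reach zero while work is still being scheduled.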
Conversely, here is your solution using only channels: https://play.golang.org/p/FbPXxPSXvFL
The problem was that the first call to CrawlURL() had no receiver on the other end of its quit channel. The fix is to run that call in its own goroutine and receive from the channel in Crawl:
package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

var crawledURLs = make(map[string]bool)
var mux sync.Mutex

func CrawlURL(url string, depth int, fetcher Fetcher, quit chan bool) {
    // For very first function instance, this would block forever if
    // nobody is receiving from the other end of this channel
    defer func() {
        quit <- true
    }()
    if depth <= 0 {
        return
    }

    mux.Lock()
    _, isCrawled := crawledURLs[url]
    if isCrawled {
        mux.Unlock()
        return
    }
    crawledURLs[url] = true
    mux.Unlock()

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    quitThis := make(chan bool)
    for _, u := range urls {
        go CrawlURL(u, depth-1, fetcher, quitThis)
    }
    for range urls {
        <-quitThis
    }
    return
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    lastQuit := make(chan bool)
    go CrawlURL(url, depth, fetcher, lastQuit)
    // You need to receive from this channel in order to
    // unblock the called function
    <-lastQuit
    return
}

func main() {
    Crawl("https://golang.org/", 10, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}