We should hand our downstream code a more flexible object to work with, while staying close to the convenience of the current API: something that does not parse the HTML when the downstream code doesn't WANT it parsed.
func Handler(req ScraperRequest, resp ServerResponse) {
root := resp.Parse() // Existing use case, consumes resp.Body to make Node
title_element := root.Find("title")[0]
req.Remarks.Printf("title = %s\n", title_element.Text())
req.Remarks.Printf("Started %v, lasted %v\n",
req.Stats.Started,
req.Stats.Duration,
)
expected_bytes := resp.Response.ContentLength
got_bytes := resp.BytesRead
if got_bytes != expected_bytes {
req.Remarks.Printf("Expected %d bytes, got %d\n",
expected_bytes,
got_bytes
)
}
root.Find("a").Queue()
}
// my_regexp is the pattern GreppyHandler counts matches of in the raw
// response body. MustCompile panics on an invalid pattern, so a bad
// expression surfaces at package initialization.
var my_regexp = regexp.MustCompile(`foo`)
// GreppyHandler shows the no-parse use case. It reads the raw response body
// and remarks on how many times my_regexp matches it, without calling
// resp.Parse.
func GreppyHandler(req ScraperRequest, resp ServerResponse) {
	// ioutil.ReadAll returns ([]byte, error); the error has to be received
	// and checked rather than dropped.
	data, err := ioutil.ReadAll(resp.Body) // Or, resp.ReadAll()
	if err != nil {
		req.Remarks.Printf("reading response body: %v\n", err)
		return
	}

	matches := my_regexp.FindAll(data, -1)
	req.Remarks.Printf("Found %d matches\n", len(matches))
}
This also demonstrates some unrelated, but planned, API improvements (for example, Node.Text()).