// Copyright 2018 Adam Tauber // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package colly import ( "bytes" "fmt" "net/http" "net/http/httptest" "os" "reflect" "regexp" "strings" "testing" "github.com/PuerkitoBio/goquery" "github.com/gocolly/colly/debug" ) var serverIndexResponse = []byte("hello world\n") var robotsFile = ` User-agent: * Allow: /allowed Disallow: /disallowed ` func newTestServer() *httptest.Server { mux := http.NewServeMux() mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) w.Write(serverIndexResponse) }) mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") w.Write([]byte(`
This is a test page
This is a test paragraph
`)) }) mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) { if r.Method == "POST" { w.Header().Set("Content-Type", "text/html") w.Write([]byte(r.FormValue("name"))) } }) mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) w.Write([]byte(robotsFile)) }) mux.HandleFunc("/allowed", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) w.Write([]byte("allowed")) }) mux.HandleFunc("/disallowed", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) w.Write([]byte("disallowed")) }) mux.Handle("/redirect", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { http.Redirect(w, r, "/redirected/", http.StatusSeeOther) })) mux.Handle("/redirected/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { fmt.Fprintf(w, `test`) })) mux.HandleFunc("/set_cookie", func(w http.ResponseWriter, r *http.Request) { c := &http.Cookie{Name: "test", Value: "testv", HttpOnly: false} http.SetCookie(w, c) w.WriteHeader(200) w.Write([]byte("ok")) }) mux.HandleFunc("/check_cookie", func(w http.ResponseWriter, r *http.Request) { cs := r.Cookies() if len(cs) != 1 || r.Cookies()[0].Value != "testv" { w.WriteHeader(500) w.Write([]byte("nok")) return } w.WriteHeader(200) w.Write([]byte("ok")) }) mux.HandleFunc("/500", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") w.WriteHeader(500) w.Write([]byte("error
")) }) mux.HandleFunc("/user_agent", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) w.Write([]byte(r.Header.Get("User-Agent"))) }) mux.HandleFunc("/base", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") w.Write([]byte(` tags")
}
}
// TestCollectorURLRevisit exercises the AllowURLRevisit switch of a Collector.
//
// Requests are counted through an OnRequest callback. With a freshly created
// collector, two visits to the same URL must produce exactly one request;
// after AllowURLRevisit is set to true, two more visits must both go through,
// bringing the total to three.
func TestCollectorURLRevisit(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	var requests int
	collector := NewCollector()
	collector.OnRequest(func(_ *Request) {
		requests++
	})

	// visitTwice requests the server root two times in a row. The results of
	// Visit are deliberately discarded: only the request counter is inspected.
	visitTwice := func() {
		for i := 0; i < 2; i++ {
			collector.Visit(server.URL)
		}
	}

	visitTwice()
	if requests != 1 {
		t.Error("URL revisited")
	}

	collector.AllowURLRevisit = true

	visitTwice()
	if requests != 3 {
		t.Error("URL not revisited")
	}
}
// TestCollectorPost verifies that Collector.Post delivers form data to the
// server. The test server's /login handler writes the "name" form value back
// for POST requests, so the response body must equal the value that was sent.
//
// The test fails if Post reports an error, if no response callback runs at
// all, or if the echoed body differs from the posted value.
func TestCollectorPost(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	postValue := "hello"
	responded := false

	c := NewCollector()
	c.OnResponse(func(r *Response) {
		responded = true
		if got := string(r.Body); got != postValue {
			t.Errorf("Failed to send data with POST: got %q, want %q", got, postValue)
		}
	})

	if err := c.Post(ts.URL+"/login", map[string]string{
		"name": postValue,
	}); err != nil {
		t.Fatalf("POST to /login failed: %v", err)
	}

	// The body comparison lives inside the callback; if the callback never
	// runs, nothing is checked, so make that case an explicit failure.
	if !responded {
		t.Error("OnResponse callback was not invoked for the POST request")
	}
}
// TestRedirect checks that a collector follows the test server's /redirect
// endpoint (which redirects to /redirected/) and that the Request attached to
// the callbacks reflects the final location:
//
//   - in OnResponse, the request URL must end in "/redirected/";
//   - in OnHTML, relative links must resolve against the redirected location,
//     i.e. end in "/redirected/test".
//
// The test also fails if Visit reports an error or if no response callback
// runs, since in either case the URL checks above would be skipped silently.
func TestRedirect(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	c := NewCollector()
	c.OnHTML("a[href]", func(e *HTMLElement) {
		u := e.Request.AbsoluteURL(e.Attr("href"))
		if !strings.HasSuffix(u, "/redirected/test") {
			t.Error("Invalid URL after redirect: " + u)
		}
	})

	responded := false
	c.OnResponse(func(r *Response) {
		responded = true
		if u := r.Request.URL.String(); !strings.HasSuffix(u, "/redirected/") {
			t.Error("Invalid URL in Request after redirect: " + u)
		}
	})

	if err := c.Visit(ts.URL + "/redirect"); err != nil {
		t.Fatalf("visiting /redirect failed: %v", err)
	}
	if !responded {
		t.Error("OnResponse callback was not invoked after redirect")
	}
}
func TestBaseTag(t *testing.T) {
ts := newTestServer()
defer ts.Close()
c := NewCollector()
c.OnHTML("a[href]", func(e *HTMLElement) {
u := e.Request.AbsoluteURL(e.Attr("href"))
if u != "http://xy.com/z" {
t.Error("Invalid
tags") } }

// BenchmarkOnHTML measures visiting the test server's /html page with an
// OnHTML callback registered for "p" elements. Each iteration uses a distinct
// query string so that every URL is new to the collector.
func BenchmarkOnHTML(b *testing.B) {
	ts := newTestServer()
	defer ts.Close()

	c := NewCollector()
	c.OnHTML("p", func(_ *HTMLElement) {})

	// Keep server start-up and collector construction out of the measurement.
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		if err := c.Visit(fmt.Sprintf("%s/html?q=%d", ts.URL, n)); err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkOnXML measures visiting the test server's /html page with an OnXML
// callback registered for the XPath expression "//p". Each iteration uses a
// distinct query string so that every URL is new to the collector.
func BenchmarkOnXML(b *testing.B) {
	ts := newTestServer()
	defer ts.Close()

	c := NewCollector()
	c.OnXML("//p", func(_ *XMLElement) {})

	// Keep server start-up and collector construction out of the measurement.
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		if err := c.Visit(fmt.Sprintf("%s/html?q=%d", ts.URL, n)); err != nil {
			b.Fatal(err)
		}
	}
}

// BenchmarkOnResponse measures repeated visits to the test server's root URL
// with a no-op OnResponse callback. AllowURLRevisit is enabled because the
// same URL is requested on every iteration.
func BenchmarkOnResponse(b *testing.B) {
	ts := newTestServer()
	defer ts.Close()

	c := NewCollector()
	c.AllowURLRevisit = true
	c.OnResponse(func(_ *Response) {})

	// Keep server start-up and collector construction out of the measurement.
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		if err := c.Visit(ts.URL); err != nil {
			b.Fatal(err)
		}
	}
}