180 lines
5.3 KiB
Go
180 lines
5.3 KiB
Go
// Copyright 2018 Adam Tauber
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package colly
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"io"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
"sync/atomic"
|
|
)
|
|
|
|
// Request is the representation of a HTTP request made by a Collector
|
|
type Request struct {
|
|
// URL is the parsed URL of the HTTP request
|
|
URL *url.URL
|
|
// Headers contains the Request's HTTP headers
|
|
Headers *http.Header
|
|
// Ctx is a context between a Request and a Response
|
|
Ctx *Context
|
|
// Depth is the number of the parents of the request
|
|
Depth int
|
|
// Method is the HTTP method of the request
|
|
Method string
|
|
// Body is the request body which is used on POST/PUT requests
|
|
Body io.Reader
|
|
// ResponseCharacterencoding is the character encoding of the response body.
|
|
// Leave it blank to allow automatic character encoding of the response body.
|
|
// It is empty by default and it can be set in OnRequest callback.
|
|
ResponseCharacterEncoding string
|
|
// ID is the Unique identifier of the request
|
|
ID uint32
|
|
collector *Collector
|
|
abort bool
|
|
baseURL *url.URL
|
|
// ProxyURL is the proxy address that handles the request
|
|
ProxyURL string
|
|
}
|
|
|
|
// serializableRequest is the flat, JSON-encodable snapshot of a Request
// that Marshal fills in and encodes.
type serializableRequest struct {
	// URL is the request URL rendered as a string.
	URL string
	// Method is the HTTP method of the request.
	Method string
	// Body is the fully read request body.
	Body []byte
	// ID is the identifier of the request.
	ID uint32
	// Ctx holds the key/value pairs copied out of the request Context.
	Ctx map[string]interface{}
	// Headers is a copy of the request's header map value.
	Headers http.Header
}
|
|
|
|
// New creates a new request with the context of the original request
|
|
func (r *Request) New(method, URL string, body io.Reader) (*Request, error) {
|
|
u, err := url.Parse(URL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &Request{
|
|
Method: method,
|
|
URL: u,
|
|
Body: body,
|
|
Ctx: r.Ctx,
|
|
Headers: &http.Header{},
|
|
ID: atomic.AddUint32(&r.collector.requestCount, 1),
|
|
collector: r.collector,
|
|
}, nil
|
|
}
|
|
|
|
// Abort cancels the HTTP request when called in an OnRequest callback
|
|
func (r *Request) Abort() {
|
|
r.abort = true
|
|
}
|
|
|
|
// AbsoluteURL returns with the resolved absolute URL of an URL chunk.
|
|
// AbsoluteURL returns empty string if the URL chunk is a fragment or
|
|
// could not be parsed
|
|
func (r *Request) AbsoluteURL(u string) string {
|
|
if strings.HasPrefix(u, "#") {
|
|
return ""
|
|
}
|
|
var base *url.URL
|
|
if r.baseURL != nil {
|
|
base = r.baseURL
|
|
} else {
|
|
base = r.URL
|
|
}
|
|
absURL, err := base.Parse(u)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
absURL.Fragment = ""
|
|
if absURL.Scheme == "//" {
|
|
absURL.Scheme = r.URL.Scheme
|
|
}
|
|
return absURL.String()
|
|
}
|
|
|
|
// Visit continues Collector's collecting job by creating a
|
|
// request and preserves the Context of the previous request.
|
|
// Visit also calls the previously provided callbacks
|
|
func (r *Request) Visit(URL string) error {
|
|
return r.collector.scrape(r.AbsoluteURL(URL), "GET", r.Depth+1, nil, r.Ctx, nil, true)
|
|
}
|
|
|
|
// Post continues a collector job by creating a POST request and preserves the Context
|
|
// of the previous request.
|
|
// Post also calls the previously provided callbacks
|
|
func (r *Request) Post(URL string, requestData map[string]string) error {
|
|
return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, createFormReader(requestData), r.Ctx, nil, true)
|
|
}
|
|
|
|
// PostRaw starts a collector job by creating a POST request with raw binary data.
|
|
// PostRaw preserves the Context of the previous request
|
|
// and calls the previously provided callbacks
|
|
func (r *Request) PostRaw(URL string, requestData []byte) error {
|
|
return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, bytes.NewReader(requestData), r.Ctx, nil, true)
|
|
}
|
|
|
|
// PostMultipart starts a collector job by creating a Multipart POST request
|
|
// with raw binary data. PostMultipart also calls the previously provided.
|
|
// callbacks
|
|
func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error {
|
|
boundary := randomBoundary()
|
|
hdr := http.Header{}
|
|
hdr.Set("Content-Type", "multipart/form-data; boundary="+boundary)
|
|
hdr.Set("User-Agent", r.collector.UserAgent)
|
|
return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, createMultipartReader(boundary, requestData), r.Ctx, hdr, true)
|
|
}
|
|
|
|
// Retry submits HTTP request again with the same parameters
|
|
func (r *Request) Retry() error {
|
|
return r.collector.scrape(r.URL.String(), r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, false)
|
|
}
|
|
|
|
// Do submits the request
|
|
func (r *Request) Do() error {
|
|
return r.collector.scrape(r.URL.String(), r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, !r.collector.AllowURLRevisit)
|
|
}
|
|
|
|
// Marshal serializes the Request
|
|
func (r *Request) Marshal() ([]byte, error) {
|
|
ctx := make(map[string]interface{})
|
|
if r.Ctx != nil {
|
|
r.Ctx.ForEach(func(k string, v interface{}) interface{} {
|
|
ctx[k] = v
|
|
return nil
|
|
})
|
|
}
|
|
var err error
|
|
var body []byte
|
|
if r.Body != nil {
|
|
body, err = ioutil.ReadAll(r.Body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
sr := &serializableRequest{
|
|
URL: r.URL.String(),
|
|
Method: r.Method,
|
|
Body: body,
|
|
ID: r.ID,
|
|
Ctx: ctx,
|
|
}
|
|
if r.Headers != nil {
|
|
sr.Headers = *r.Headers
|
|
}
|
|
return json.Marshal(sr)
|
|
}
|