// Copyright 2018 Adam Tauber // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package colly import ( "bytes" "fmt" "io/ioutil" "mime" "net/http" "strings" "github.com/saintfish/chardet" "golang.org/x/net/html/charset" ) // Response is the representation of a HTTP response made by a Collector type Response struct { // StatusCode is the status code of the Response StatusCode int // Body is the content of the Response Body []byte // Ctx is a context between a Request and a Response Ctx *Context // Request is the Request object of the response Request *Request // Headers contains the Response's HTTP headers Headers *http.Header } // Save writes response body to disk func (r *Response) Save(fileName string) error { return ioutil.WriteFile(fileName, r.Body, 0644) } // FileName returns the sanitized file name parsed from "Content-Disposition" // header or from URL func (r *Response) FileName() string { _, params, err := mime.ParseMediaType(r.Headers.Get("Content-Disposition")) if fName, ok := params["filename"]; ok && err == nil { return SanitizeFileName(fName) } if r.Request.URL.RawQuery != "" { return SanitizeFileName(fmt.Sprintf("%s_%s", r.Request.URL.Path, r.Request.URL.RawQuery)) } return SanitizeFileName(strings.TrimPrefix(r.Request.URL.Path, "/")) } func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error { if defaultEncoding != "" { tmpBody, err := encodeBytes(r.Body, "text/plain; charset="+defaultEncoding) if err != nil { return err } r.Body = tmpBody return nil } contentType := strings.ToLower(r.Headers.Get("Content-Type")) if !strings.Contains(contentType, "charset") { if !detectCharset { return nil } d := chardet.NewTextDetector() r, err := d.DetectBest(r.Body) if err != nil { return err } contentType = "text/plain; charset=" + r.Charset } if strings.Contains(contentType, "utf-8") || strings.Contains(contentType, "utf8") { return nil } tmpBody, err := encodeBytes(r.Body, contentType) if err != nil { return err } r.Body = tmpBody return nil } func encodeBytes(b []byte, contentType string) ([]byte, error) { r, err := charset.NewReader(bytes.NewReader(b), contentType) if err != nil { return nil, err } return ioutil.ReadAll(r) }