129 lines
3.4 KiB
Go
129 lines
3.4 KiB
Go
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
||
|
|
||
|
package storage
|
||
|
|
||
|
import (
|
||
|
"net/http"
|
||
|
"net/http/cookiejar"
|
||
|
"net/url"
|
||
|
"strings"
|
||
|
"sync"
|
||
|
)
|
||
|
|
||
|
// Storage is an interface which handles Collector's internal data,
// like visited urls and cookies.
// The default Storage of the Collector is the InMemoryStorage.
// Collector's storage can be changed by calling the Collector.SetStorage()
// function.
type Storage interface {
	// Init initializes the storage.
	Init() error
	// Visited receives and stores a request ID that is visited by the
	// Collector.
	Visited(requestID uint64) error
	// IsVisited returns true if the request was visited before IsVisited
	// is called.
	IsVisited(requestID uint64) (bool, error)
	// Cookies retrieves stored cookies for a given host, serialized as a
	// string.
	Cookies(u *url.URL) string
	// SetCookies stores the serialized cookies for a given host.
	SetCookies(u *url.URL, cookies string)
}
|
||
|
|
||
|
// InMemoryStorage is the default storage backend of colly.
// InMemoryStorage keeps cookies and visited urls in memory
// without persisting data on the disk.
//
// The fields are populated by Init; the other methods dereference them
// without nil checks.
type InMemoryStorage struct {
	// visitedURLs records the request IDs passed to Visited.
	visitedURLs map[uint64]bool
	// lock guards visitedURLs.
	lock *sync.RWMutex
	// jar holds the cookies read and written by Cookies and SetCookies.
	jar *cookiejar.Jar
}
|
||
|
|
||
|
// Init initializes InMemoryStorage
|
||
|
func (s *InMemoryStorage) Init() error {
|
||
|
if s.visitedURLs == nil {
|
||
|
s.visitedURLs = make(map[uint64]bool)
|
||
|
}
|
||
|
if s.lock == nil {
|
||
|
s.lock = &sync.RWMutex{}
|
||
|
}
|
||
|
if s.jar == nil {
|
||
|
var err error
|
||
|
s.jar, err = cookiejar.New(nil)
|
||
|
return err
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// Visited implements Storage.Visited()
|
||
|
func (s *InMemoryStorage) Visited(requestID uint64) error {
|
||
|
s.lock.Lock()
|
||
|
s.visitedURLs[requestID] = true
|
||
|
s.lock.Unlock()
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// IsVisited implements Storage.IsVisited()
|
||
|
func (s *InMemoryStorage) IsVisited(requestID uint64) (bool, error) {
|
||
|
s.lock.RLock()
|
||
|
visited := s.visitedURLs[requestID]
|
||
|
s.lock.RUnlock()
|
||
|
return visited, nil
|
||
|
}
|
||
|
|
||
|
// Cookies implements Storage.Cookies()
|
||
|
func (s *InMemoryStorage) Cookies(u *url.URL) string {
|
||
|
return StringifyCookies(s.jar.Cookies(u))
|
||
|
}
|
||
|
|
||
|
// SetCookies implements Storage.SetCookies()
|
||
|
func (s *InMemoryStorage) SetCookies(u *url.URL, cookies string) {
|
||
|
s.jar.SetCookies(u, UnstringifyCookies(cookies))
|
||
|
}
|
||
|
|
||
|
// Close implements Storage.Close()
|
||
|
func (s *InMemoryStorage) Close() error {
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// StringifyCookies serializes a list of http.Cookies to a string.
//
// Each cookie is rendered with its String method and the results are
// joined with a newline, one cookie per line. An empty or nil list
// yields the empty string.
func StringifyCookies(cookies []*http.Cookie) string {
	lines := make([]string, 0, len(cookies))
	for _, c := range cookies {
		lines = append(lines, c.String())
	}
	return strings.Join(lines, "\n")
}
|
||
|
|
||
|
// UnstringifyCookies deserializes a cookie string to http.Cookies.
//
// The input is split on newlines and every line is treated as the value
// of a Set-Cookie header; parsing itself is delegated to
// http.Response.Cookies.
func UnstringifyCookies(s string) []*http.Cookie {
	header := http.Header{"Set-Cookie": strings.Split(s, "\n")}
	resp := &http.Response{Header: header}
	return resp.Cookies()
}
|
||
|
|
||
|
// ContainsCookie checks if a cookie name is represented in cookies.
//
// It returns true as soon as a cookie whose Name equals name is found,
// and false when none matches or cookies is empty.
func ContainsCookie(cookies []*http.Cookie, name string) bool {
	for _, candidate := range cookies {
		if candidate.Name == name {
			return true
		}
	}
	return false
}
|