// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"strings"
	"sync"
)

// Storage is an interface which handles Collector's internal data,
// like visited urls and cookies.
// The default Storage of the Collector is the InMemoryStorage.
// Collector's storage can be changed by calling Collector.SetStorage()
// function.
type Storage interface {
	// Init initializes the storage.
	Init() error
	// Visited receives and stores a request ID that is visited by the
	// Collector.
	Visited(requestID uint64) error
	// IsVisited returns true if the request was visited before IsVisited
	// is called.
	IsVisited(requestID uint64) (bool, error)
	// Cookies retrieves the stored cookies for a given host, serialized
	// into a single string.
	Cookies(u *url.URL) string
	// SetCookies stores cookies for a given host. The cookies are passed
	// in serialized form as a single string.
	SetCookies(u *url.URL, cookies string)
}

// InMemoryStorage is the default storage backend of colly.
// InMemoryStorage keeps cookies and visited urls in memory
// without persisting data on the disk.
type InMemoryStorage struct {
	// visitedURLs records the request IDs that were passed to Visited.
	visitedURLs map[uint64]bool
	// lock guards access to visitedURLs.
	lock *sync.RWMutex
	// jar holds the cookies read and written by Cookies and SetCookies.
	jar *cookiejar.Jar
}

// Init prepares the storage for use. Every field that is still nil is
// allocated; fields that are already set are left untouched, so calling
// Init more than once keeps previously stored data. The only error it can
// return is the one reported by cookiejar.New.
func (s *InMemoryStorage) Init() error {
	if s.visitedURLs == nil {
		s.visitedURLs = make(map[uint64]bool)
	}
	if s.lock == nil {
		s.lock = new(sync.RWMutex)
	}
	if s.jar != nil {
		return nil
	}
	jar, err := cookiejar.New(nil)
	s.jar = jar
	return err
}

// Visited marks requestID as visited. It always returns nil.
func (s *InMemoryStorage) Visited(requestID uint64) error {
	mu := s.lock
	mu.Lock()
	s.visitedURLs[requestID] = true
	mu.Unlock()
	return nil
}

// IsVisited reports whether requestID was previously marked by Visited.
// The returned error is always nil.
func (s *InMemoryStorage) IsVisited(requestID uint64) (bool, error) {
	mu := s.lock
	mu.RLock()
	seen, ok := s.visitedURLs[requestID]
	mu.RUnlock()
	return ok && seen, nil
}

// Cookies returns the jar's cookies for u, serialized with
// StringifyCookies.
func (s *InMemoryStorage) Cookies(u *url.URL) string {
	stored := s.jar.Cookies(u)
	return StringifyCookies(stored)
}

// SetCookies parses cookies with UnstringifyCookies and hands the result
// to the jar for u.
func (s *InMemoryStorage) SetCookies(u *url.URL, cookies string) {
	parsed := UnstringifyCookies(cookies)
	s.jar.SetCookies(u, parsed)
}

// Close is a no-op for the in-memory backend and always returns nil.
func (s *InMemoryStorage) Close() error {
	return nil
}

// StringifyCookies serializes a list of http.Cookies into a single string:
// each cookie is rendered with its String method and the results are
// joined with newlines.
func StringifyCookies(cookies []*http.Cookie) string {
	lines := make([]string, 0, len(cookies))
	for _, c := range cookies {
		lines = append(lines, c.String())
	}
	return strings.Join(lines, "\n")
}

// UnstringifyCookies deserializes a newline-separated cookie string into
// http.Cookies. Every line is treated as the value of a Set-Cookie header
// and parsed by http.Response.Cookies.
func UnstringifyCookies(s string) []*http.Cookie {
	header := http.Header{"Set-Cookie": strings.Split(s, "\n")}
	resp := &http.Response{Header: header}
	return resp.Cookies()
}

// ContainsCookie reports whether any cookie in cookies is named name.
func ContainsCookie(cookies []*http.Cookie, name string) bool {
	for i := range cookies {
		if cookies[i].Name == name {
			return true
		}
	}
	return false
}