121 lines
3.8 KiB
Go
121 lines
3.8 KiB
Go
|
// Copyright 2018 Adam Tauber
|
||
|
//
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
|
||
|
package colly
|
||
|
|
||
|
import (
|
||
|
"strings"
|
||
|
|
||
|
"github.com/PuerkitoBio/goquery"
|
||
|
"golang.org/x/net/html"
|
||
|
)
|
||
|
|
||
|
// HTMLElement is the representation of a HTML tag.
|
||
|
type HTMLElement struct {
|
||
|
// Name is the name of the tag
|
||
|
Name string
|
||
|
Text string
|
||
|
attributes []html.Attribute
|
||
|
// Request is the request object of the element's HTML document
|
||
|
Request *Request
|
||
|
// Response is the Response object of the element's HTML document
|
||
|
Response *Response
|
||
|
// DOM is the goquery parsed DOM object of the page. DOM is relative
|
||
|
// to the current HTMLElement
|
||
|
DOM *goquery.Selection
|
||
|
// Index stores the position of the current element within all the elements matched by an OnHTML callback
|
||
|
Index int
|
||
|
}
|
||
|
|
||
|
// NewHTMLElementFromSelectionNode creates a HTMLElement from a goquery.Selection Node.
|
||
|
func NewHTMLElementFromSelectionNode(resp *Response, s *goquery.Selection, n *html.Node, idx int) *HTMLElement {
|
||
|
return &HTMLElement{
|
||
|
Name: n.Data,
|
||
|
Request: resp.Request,
|
||
|
Response: resp,
|
||
|
Text: goquery.NewDocumentFromNode(n).Text(),
|
||
|
DOM: s,
|
||
|
Index: idx,
|
||
|
attributes: n.Attr,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Attr returns the selected attribute of a HTMLElement or empty string
|
||
|
// if no attribute found
|
||
|
func (h *HTMLElement) Attr(k string) string {
|
||
|
for _, a := range h.attributes {
|
||
|
if a.Key == k {
|
||
|
return a.Val
|
||
|
}
|
||
|
}
|
||
|
return ""
|
||
|
}
|
||
|
|
||
|
// ChildText returns the concatenated and stripped text content of the matching
|
||
|
// elements.
|
||
|
func (h *HTMLElement) ChildText(goquerySelector string) string {
|
||
|
return strings.TrimSpace(h.DOM.Find(goquerySelector).Text())
|
||
|
}
|
||
|
|
||
|
// ChildAttr returns the stripped text content of the first matching
|
||
|
// element's attribute.
|
||
|
func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string {
|
||
|
if attr, ok := h.DOM.Find(goquerySelector).Attr(attrName); ok {
|
||
|
return strings.TrimSpace(attr)
|
||
|
}
|
||
|
return ""
|
||
|
}
|
||
|
|
||
|
// ChildAttrs returns the stripped text content of all the matching
|
||
|
// element's attributes.
|
||
|
func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string {
|
||
|
var res []string
|
||
|
h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) {
|
||
|
if attr, ok := s.Attr(attrName); ok {
|
||
|
res = append(res, strings.TrimSpace(attr))
|
||
|
}
|
||
|
})
|
||
|
return res
|
||
|
}
|
||
|
|
||
|
// ForEach iterates over the elements matched by the first argument
|
||
|
// and calls the callback function on every HTMLElement match.
|
||
|
func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement)) {
|
||
|
i := 0
|
||
|
h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) {
|
||
|
for _, n := range s.Nodes {
|
||
|
callback(i, NewHTMLElementFromSelectionNode(h.Response, s, n, i))
|
||
|
i++
|
||
|
}
|
||
|
})
|
||
|
}
|
||
|
|
||
|
// ForEachWithBreak iterates over the elements matched by the first argument
|
||
|
// and calls the callback function on every HTMLElement match.
|
||
|
// It is identical to ForEach except that it is possible to break
|
||
|
// out of the loop by returning false in the callback function. It returns the
|
||
|
// current Selection object.
|
||
|
func (h *HTMLElement) ForEachWithBreak(goquerySelector string, callback func(int, *HTMLElement) bool) {
|
||
|
i := 0
|
||
|
h.DOM.Find(goquerySelector).EachWithBreak(func(_ int, s *goquery.Selection) bool {
|
||
|
for _, n := range s.Nodes {
|
||
|
if callback(i, NewHTMLElementFromSelectionNode(h.Response, s, n, i)) {
|
||
|
i++
|
||
|
return true
|
||
|
}
|
||
|
}
|
||
|
return false
|
||
|
})
|
||
|
}
|