// Copyright 2018 Adam Tauber // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package colly import ( "errors" "reflect" "strings" "github.com/PuerkitoBio/goquery" ) // Unmarshal is a shorthand for colly.UnmarshalHTML func (h *HTMLElement) Unmarshal(v interface{}) error { return UnmarshalHTML(v, h.DOM) } // UnmarshalHTML declaratively extracts text or attributes to a struct from // HTML response using struct tags composed of css selectors. // Allowed struct tags: // - "selector" (required): CSS (goquery) selector of the desired data // - "attr" (optional): Selects the matching element's attribute's value. // Leave it blank or omit to get the text of the element. // // Example struct declaration: // // type Nested struct { // String string `selector:"div > p"` // Classes []string `selector:"li" attr:"class"` // Struct *Nested `selector:"div > div"` // } // // Supported types: struct, *struct, string, []string func UnmarshalHTML(v interface{}, s *goquery.Selection) error { rv := reflect.ValueOf(v) if rv.Kind() != reflect.Ptr || rv.IsNil() { return errors.New("Invalid type or nil-pointer") } sv := rv.Elem() st := reflect.TypeOf(v).Elem() for i := 0; i < sv.NumField(); i++ { attrV := sv.Field(i) if !attrV.CanAddr() || !attrV.CanSet() { continue } if err := unmarshalAttr(s, attrV, st.Field(i)); err != nil { return err } } return nil } func unmarshalAttr(s *goquery.Selection, attrV reflect.Value, attrT reflect.StructField) error { selector := attrT.Tag.Get("selector") //selector is "-" specify that field should ignore. if selector == "-" { return nil } htmlAttr := attrT.Tag.Get("attr") // TODO support more types switch attrV.Kind() { case reflect.Slice: if err := unmarshalSlice(s, selector, htmlAttr, attrV); err != nil { return err } case reflect.String: val := getDOMValue(s.Find(selector), htmlAttr) attrV.Set(reflect.Indirect(reflect.ValueOf(val))) case reflect.Struct: if err := unmarshalStruct(s, selector, attrV); err != nil { return err } case reflect.Ptr: if err := unmarshalPtr(s, selector, attrV); err != nil { return err } default: return errors.New("Invalid type: " + attrV.String()) } return nil } func unmarshalStruct(s *goquery.Selection, selector string, attrV reflect.Value) error { newS := s if selector != "" { newS = newS.Find(selector) } if newS.Nodes == nil { return nil } v := reflect.New(attrV.Type()) err := UnmarshalHTML(v.Interface(), newS) if err != nil { return err } attrV.Set(reflect.Indirect(v)) return nil } func unmarshalPtr(s *goquery.Selection, selector string, attrV reflect.Value) error { newS := s if selector != "" { newS = newS.Find(selector) } if newS.Nodes == nil { return nil } e := attrV.Type().Elem() if e.Kind() != reflect.Struct { return errors.New("Invalid slice type") } v := reflect.New(e) err := UnmarshalHTML(v.Interface(), newS) if err != nil { return err } attrV.Set(v) return nil } func unmarshalSlice(s *goquery.Selection, selector, htmlAttr string, attrV reflect.Value) error { if attrV.Pointer() == 0 { v := reflect.MakeSlice(attrV.Type(), 0, 0) attrV.Set(v) } switch attrV.Type().Elem().Kind() { case reflect.String: s.Find(selector).Each(func(_ int, s *goquery.Selection) { val := getDOMValue(s, htmlAttr) attrV.Set(reflect.Append(attrV, reflect.Indirect(reflect.ValueOf(val)))) }) case reflect.Ptr: s.Find(selector).Each(func(_ int, innerSel *goquery.Selection) { someVal := reflect.New(attrV.Type().Elem().Elem()) UnmarshalHTML(someVal.Interface(), innerSel) attrV.Set(reflect.Append(attrV, someVal)) }) case reflect.Struct: s.Find(selector).Each(func(_ int, innerSel *goquery.Selection) { someVal := reflect.New(attrV.Type().Elem()) UnmarshalHTML(someVal.Interface(), innerSel) attrV.Set(reflect.Append(attrV, reflect.Indirect(someVal))) }) default: return errors.New("Invalid slice type") } return nil } func getDOMValue(s *goquery.Selection, attr string) string { if attr == "" { return strings.TrimSpace(s.First().Text()) } attrV, _ := s.Attr(attr) return attrV }