package xmlquery import ( "bytes" "encoding/xml" "errors" "fmt" "io" "net/http" "strings" "golang.org/x/net/html/charset" ) // A NodeType is the type of a Node. type NodeType uint const ( // DocumentNode is a document object that, as the root of the document tree, // provides access to the entire XML document. DocumentNode NodeType = iota // DeclarationNode is the document type declaration, indicated by the following // tag (for example, ). DeclarationNode // ElementNode is an element (for example, ). ElementNode // TextNode is the text content of a node. TextNode // CommentNode a comment (for example, ). CommentNode // AttributeNode is an attribute of element. AttributeNode ) // A Node consists of a NodeType and some Data (tag name for // element nodes, content for text) and are part of a tree of Nodes. type Node struct { Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node Type NodeType Data string Prefix string NamespaceURI string Attr []xml.Attr level int // node level in the tree } // InnerText returns the text between the start and end tags of the object. func (n *Node) InnerText() string { var output func(*bytes.Buffer, *Node) output = func(buf *bytes.Buffer, n *Node) { switch n.Type { case TextNode: buf.WriteString(n.Data) return case CommentNode: return } for child := n.FirstChild; child != nil; child = child.NextSibling { output(buf, child) } } var buf bytes.Buffer output(&buf, n) return buf.String() } func outputXML(buf *bytes.Buffer, n *Node) { if n.Type == TextNode || n.Type == CommentNode { xml.EscapeText(buf, []byte(strings.TrimSpace(n.Data))) return } if n.Type == DeclarationNode { buf.WriteString("") } else { buf.WriteString(">") } for child := n.FirstChild; child != nil; child = child.NextSibling { outputXML(buf, child) } if n.Type != DeclarationNode { if n.Prefix == "" { buf.WriteString(fmt.Sprintf("", n.Data)) }else{ buf.WriteString(fmt.Sprintf("", n.Prefix, n.Data)) } } } // OutputXML returns the text that including tags name. func (n *Node) OutputXML(self bool) string { var buf bytes.Buffer if self { outputXML(&buf, n) } else { for n := n.FirstChild; n != nil; n = n.NextSibling { outputXML(&buf, n) } } return buf.String() } func addAttr(n *Node, key, val string) { var attr xml.Attr if i := strings.Index(key, ":"); i > 0 { attr = xml.Attr{ Name: xml.Name{Space: key[:i], Local: key[i+1:]}, Value: val, } } else { attr = xml.Attr{ Name: xml.Name{Local: key}, Value: val, } } n.Attr = append(n.Attr, attr) } func addChild(parent, n *Node) { n.Parent = parent if parent.FirstChild == nil { parent.FirstChild = n } else { parent.LastChild.NextSibling = n n.PrevSibling = parent.LastChild } parent.LastChild = n } func addSibling(sibling, n *Node) { for t := sibling.NextSibling; t != nil; t = t.NextSibling { sibling = t } n.Parent = sibling.Parent sibling.NextSibling = n n.PrevSibling = sibling if sibling.Parent != nil { sibling.Parent.LastChild = n } } // LoadURL loads the XML document from the specified URL. func LoadURL(url string) (*Node, error) { resp, err := http.Get(url) if err != nil { return nil, err } defer resp.Body.Close() return parse(resp.Body) } func parse(r io.Reader) (*Node, error) { var ( decoder = xml.NewDecoder(r) doc = &Node{Type: DocumentNode} space2prefix = make(map[string]string) level = 0 ) decoder.CharsetReader = charset.NewReaderLabel prev := doc for { tok, err := decoder.Token() switch { case err == io.EOF: goto quit case err != nil: return nil, err } switch tok := tok.(type) { case xml.StartElement: if level == 0 { // mising XML declaration node := &Node{Type: DeclarationNode, Data: "xml", level: 1} addChild(prev, node) level = 1 prev = node } // https://www.w3.org/TR/xml-names/#scoping-defaulting for _, att := range tok.Attr { if att.Name.Local == "xmlns" { space2prefix[att.Value] = "" } else if att.Name.Space == "xmlns" { space2prefix[att.Value] = att.Name.Local } } if tok.Name.Space != "" { if _, found := space2prefix[tok.Name.Space]; !found { return nil, errors.New("xmlquery: invalid XML document, namespace is missing") } } node := &Node{ Type: ElementNode, Data: tok.Name.Local, Prefix: space2prefix[tok.Name.Space], NamespaceURI: tok.Name.Space, Attr: tok.Attr, level: level, } //fmt.Println(fmt.Sprintf("start > %s : %d", node.Data, level)) if level == prev.level { addSibling(prev, node) } else if level > prev.level { addChild(prev, node) } else if level < prev.level { for i := prev.level - level; i > 1; i-- { prev = prev.Parent } addSibling(prev.Parent, node) } prev = node level++ case xml.EndElement: level-- case xml.CharData: node := &Node{Type: TextNode, Data: string(tok), level: level} if level == prev.level { addSibling(prev, node) } else if level > prev.level { addChild(prev, node) } case xml.Comment: node := &Node{Type: CommentNode, Data: string(tok), level: level} if level == prev.level { addSibling(prev, node) } else if level > prev.level { addChild(prev, node) } case xml.ProcInst: // Processing Instruction if prev.Type != DeclarationNode { level++ } node := &Node{Type: DeclarationNode, Data: tok.Target, level: level} pairs := strings.Split(string(tok.Inst), " ") for _, pair := range pairs { pair = strings.TrimSpace(pair) if i := strings.Index(pair, "="); i > 0 { addAttr(node, pair[:i], strings.Trim(pair[i+1:], `"`)) } } if level == prev.level { addSibling(prev, node) } else if level > prev.level { addChild(prev, node) } prev = node case xml.Directive: } } quit: return doc, nil } // Parse returns the parse tree for the XML from the given Reader. func Parse(r io.Reader) (*Node, error) { return parse(r) }