最後更新: 2021-08-17
介紹
The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt.
目錄
- 安裝
- XML Code Example
- Elements
- Test
- Siblings (or neighbours)
- Elements carry attributes as a dict
- Elements contain text (<root>text</root>)
- Parsing from strings and files
- Parser options
- Tree iteration
- Error log
- HTML
- Encode (UTF8)
- ElementPath
安裝
# Debian 7 (2.3.2-1+deb7u1)
apt-get install python-lxml
# Windows 7
easy_install.exe lxml
XML Code Example
Code:
import lxml.etree as ET
root = ET.Element("root")
doc = ET.SubElement(root, "doc")
# record 1
field1 = ET.SubElement(doc, "field1")
field1.set("name", "blah")
field1.text = "some value1"
# record 2
field2 = ET.SubElement(doc, "field2")
field2.set("name", "asdfasd")
field2.text = "some vlaue2"
tree = ET.ElementTree(root)
tree.write("filename.xml")
filename.xml
<root> <doc> <field1 name="blah">some value1</field1> <field2 name="asdfasd">some vlaue2</field2> </doc> </root>
XML
<tag attrib>text</tag>
Elements
i.e.
import lxml.etree as ET
root = ET.Element("root")
child1 = ET.SubElement(root,"child1")
child2 = ET.SubElement(root,"child2")
child3 = ET.SubElement(root,"child3")
tostring
>>> print ET.tostring(root)
<root><child1/><child2/><child3/></root>
len
>>> print(len(root))
3
Access By Index
>>> child = root[0]
>>> print(child.tag)
child1
>>> ET.dump(root[0])
<child1/>
>>> ET.dump(child1)
<child1/>
>>> child = root[1]
>>> print(child.tag)
child2
Index
>>> root.index(root[2])
2
for
>>> for child in root:
...     print child.tag
child1
child2
child3
Text
>>> ET.SubElement(root,"child4").text = "D"
>>> print ET.tostring(root)
<root><child1/><child2/><child3/><child4>D</child4></root>
# 頭尾
>>> root.insert(0, ET.Element("child0"))
>>> start = root[:1]
>>> end = root[-1:]
>>> print(start[0].tag)
child0
>>> print(end[0].tag)
child4
Test
# test if it's some kind of Element
>>> print(etree.iselement(root))
True
# test if it has children
>>> if len(root):
... print("The root element has children")
The root element has children
# Is root ?
>>> root is root[0].getparent()
True
Elements carry attributes as a dict
.get .set .items 與 []
>>> root = etree.Element("root", interesting="totally")
>>> etree.tostring(root)
b'<root interesting="totally"/>'
>>> print(root.get("interesting"))
totally
>>> print(root.get("hello"))
None
set()
>>> root.set("hello", "Huhu")
>>> print(root.get("hello"))
Huhu
>>> for name, value in sorted(root.items()):
... print('%s = %r' % (name, value))
hello = 'Huhu'
interesting = 'totally'
# 會出 Error, 因為 Value 一定要係 String
root.find("Option[@Name='Pass']").text = 123456
# .attrib
>>> attributes = root.attrib
>>> print(attributes["interesting"])
totally
>>> print(attributes.get("no-such-attribute"))
None
>>> attributes["hello"] = "Guten Tag"
>>> print(attributes["hello"])
Guten Tag
>>> print(root.get("hello"))
Guten Tag
Siblings (or neighbours)
- getprevious() | getnext()
- tostring
- .tail
# getprevious() | getnext()
>>> root[0] is root[1].getprevious() # lxml.etree only!
True
>>> root[1] is root[0].getnext() # lxml.etree only!
True
# tail
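>>> html = etree.Element("html")
>>> body = etree.SubElement(html, "body")
>>> body.text = "TEXT"
>>> br = etree.SubElement(body, "br")
>>> br.tail = "TAIL"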
>>> etree.tostring(html)
b'<html><body>TEXT<br/>TAIL</body></html>'
# tostring(html, method="text")
* If you want to read only the text, i.e. without any intermediate tags
>>> etree.tostring(html, method="text")
b'TEXTTAIL'
Elements contain text (<root>text</root>)
>>> root = etree.Element("root")
>>> root.text = "TEXT"
>>> print(root.text)
TEXT
>>> etree.tostring(root)
b'<root>TEXT</root>'
Parsing from strings and files
Source: data.xml
<root> <doc> <field1 name="myfield1">some value1</field1> <field2 name="myfield2">some vlaue2</field2> </doc> </root>
Source: String
xml = '''\
<root>
<doc>
<field1 name="myfield1">some value1</field1>
<field2 name="myfield2">some vlaue2</field2>
</doc>
</root>
'''
# XML from file
tree = etree.parse("data.xml")
# Returns the root element for this tree.
root = tree.getroot()
StringIO:
xml = '<a xmlns="test"><b xmlns="test"/></a>'
tree = etree.parse(StringIO(xml))
etree.tostring(tree.getroot())
# XML from String
root = etree.fromstring(xml)
etree.tostring(root, pretty_print=True)
# XML from function
The XML() function behaves like the fromstring() function,
but is commonly used to write XML literals(文字) right into the source
>>> root = etree.XML("<root>data</root>")
>>> print(root.tag)
root
>>> etree.tostring(root)
b'<root>data</root>'
# Incremental parsing (feed())
# To start parsing with a feed parser,
# just call its feed() method to feed it some data.
parser = etree.XMLParser()
for data in ('<?xml versio', 'n="1.0"?', '><roo', 't><a', '/></root>'):
    parser.feed(data)
# When you are done parsing, you must call the close() method
# to retrieve the root Element of the parse
root = parser.close()
print(root.tag)
root
print(root[0].tag)
a
#### If you want to configure the parser
>>> parser = etree.XMLParser(remove_blank_text=True) # lxml.etree only!
>>> root = etree.XML("<root> <a/> <b> </b> </root>", parser)
>>> etree.tostring(root)
b'<root><a/><b> </b></root>'
Parser options
>>> parser = etree.XMLParser(ns_clean=True)
>>> tree = etree.parse(StringIO(xml), parser)
>>> etree.tostring(tree.getroot())
b'<a xmlns="test"><b/></a>'
Opts:
- ns_clean - try to clean up redundant namespace declarations
- recover - try hard to parse through broken XML
- remove_comments - discard comments
- remove_blank_text - discard blank text nodes between tags, also known as ignorable whitespace.
- compact - use compact storage for short text content (on by default)
- encoding - override the document encoding ( "utf-8", "big5" )
- schema - an XMLSchema to validate against (see validation)
Tree iteration
>>> root = etree.Element("root")
>>> etree.SubElement(root, "child").text = "Child 1"
>>> etree.SubElement(root, "child").text = "Child 2"
>>> etree.SubElement(root, "another").text = "Child 3"
>>> print(etree.tostring(root, pretty_print=True))
<root>
  <child>Child 1</child>
  <child>Child 2</child>
  <another>Child 3</another>
</root>
>>> for element in root.iter():
... print("%s - %s" % (element.tag, element.text))
root - None
child - Child 1
child - Child 2
another - Child 3
#~~~~~~~~~~~~~
* you can also pass more than one tag to intercept on multiple tags during iteration.
# lxml 3.0
>>> for element in root.iter("child"):
... print("%s - %s" % (element.tag, element.text))
child - Child 1
child - Child 2
>>> for element in root.iter("another", "child"):
... print("%s - %s" % (element.tag, element.text))
child - Child 1
child - Child 2
another - Child 3
#~~~~~~~~~~~~~
Append
>>> root.append(etree.Entity("#234"))
>>> root.append(etree.Comment("some comment"))
>>> for element in root.iter():
... if isinstance(element.tag, basestring):
... print("%s - %s" % (element.tag, element.text))
... else:
... print("SPECIAL: %s - %s" % (element, element.text))
root - None
child - Child 1
child - Child 2
another - Child 3
SPECIAL: &#234; - &#234;
SPECIAL: <!--some comment--> - some comment
>>> for element in root.iter(tag=etree.Element):
... print("%s - %s" % (element.tag, element.text))
root - None
child - Child 1
child - Child 2
another - Child 3
>>> for element in root.iter(tag=etree.Entity):
... print(element.text)
&#234;
Strip
>>> for element in root.iter("*"):
... if element.text is not None and not element.text.strip():
... element.text = None
remove
Modify
Error log
>>> parser = etree.XMLParser()
>>> print(len(parser.error_log))
0
Each entry in the log has the following properties:
message: the message text
domain: the domain ID (see the lxml.etree.ErrorDomains class)
type: the message type ID (see the lxml.etree.ErrorTypes class)
level: the log level ID (see the lxml.etree.ErrorLevels class)
line: the line at which the message originated (if applicable)
column: the character column at which the message originated (if applicable)
filename: the name of the file in which the message originated (if applicable)
HTML
page = urllib.urlopen('http://www.douban.com/note/153041669/')
doc = lxml.html.document_fromstring(page.read().decode('utf8','ignore'))
for idx, el in enumerate(doc.xpath(u'//a[@rel="nofollow"]')):
    print el.attrib['href']
    print el.text.decode('utf8')
Encode (UTF8)
As with XML serialisation, the default encoding for plain text serialisation is ASCII; pass encoding='unicode' to get a Python unicode string instead of a byte string:
etree.tostring(root, encoding='unicode', method='text')
u'HelloW\xf6rld'
xml.py <- encoding='utf8'
utf8_parser = etree.XMLParser(encoding='utf8')
f = open("data.xml")
xml = f.read()
f.close()
doc = etree.parse(StringIO(xml), parser=utf8_parser)
weather = doc.xpath('weatherinfos/weatherinfo/time_range/weather')
if not weather[0].text:
    text = "None"
else:
    text = weather[0].text
# 輸出第一 weather 的 text
print weather[0].tag + " => " + text
# 輸出所有 weather
for elem in weather:
    if not elem.text:
        text = "None"
    else:
        text = elem.text
    print elem.tag + " => " + text
ElementPath(find)
The goal is to support a small subset of the abbreviated syntax
Function:
-
find(match)
efficiently returns only the first match
match may be a tag name or path. Returns an element instance or None.
-
findtext(match[, default=None])
match may be a tag name or path.
returns the .text content of the first match element
-
findall(match)
returns a list of matching Elements
- iterfind(match) # iterates over all Elements that match the path expression
Syntax:
- . # Select the current node ( indicate that it’s a relative path ) ( "./tag" 與 "tag" 是沒有分別的)
- .. # Selects the parent element
- /tag # tag names, separated by slashes
- // # Selects all subelements(entire tree) ( ie. ".//" )
- * # Selects all child elements. ( "*/egg" all grandchildren named "egg")
- [@attrib]
- [@attrib='value']
注意事項:
* All XML Elements Must Have a Closing Tag
* XML Tags are Case Sensitive
* XML Attribute Values Must be Quoted
* XML Documents Must Have a Root Element
* entity reference
&lt; < less than
&gt; > greater than
&amp; & ampersand
&apos; ' apostrophe
&quot; " quotation mark
i.e
xml = '''\
<root>
  <child id="1">
    <grandchildren id="1"/>
    <grandchildren id="2" sex="F"/>
    <grandchildren id="3" ages="14"/>
  </child>
  <child id="2">child2 text</child>
  <child id="3" ages="13" />
  <child id="4" sex="M" >Peter</child>
  <child id="4" sex="F" >May</child>
</root>
'''
root = etree.fromstring(xml)
print root.find("child")
<Element child at 0x27d7348>
print root.find("child5")
None
>>> print root.find(".//child[@sex]").text
Peter
>>> print root.findall(".//child[@sex]")[0].get('sex')
M
>>> print(root.findall(".//a[@y]"))
[]
>>> print(root.find(".//b").tag)
b
print root.iterfind(".//child[@sex]")
<generator object select at 0x0000000002A2F048>
for c in root.iterfind(".//child[@sex]"):
    print c.text
Doc