【Python】 xml转json

　　虽然python有解析xml的模块，也有生成json的模块，但是没有把这两者连接起来的模块。

　　下面是来自MIT的大神Martin Blech写的一个方便的模块（xmltodict），供大家参考：先用其中的 parse() 把 XML 解析成字典，再交给标准库的 json.dumps() 就得到了 JSON。也别忘了在用之前先拜谢作者三次ww

#!/usr/bin/env python

"Makes working with XML feel like you are working with JSON"

try:

    from defusedexpat import pyexpat as expat

except ImportError:

    from xml.parsers import expat

from xml.sax.saxutils import XMLGenerator

from xml.sax.xmlreader import AttributesImpl

try:  # pragma no cover

    from cStringIO import StringIO

except ImportError:  # pragma no cover

    try:

        from StringIO import StringIO

    except ImportError:

        from io import StringIO

try:  # pragma no cover

    from collections import OrderedDict

except ImportError:  # pragma no cover

    try:

        from ordereddict import OrderedDict

    except ImportError:

        OrderedDict = dict

# Python 2/3 compatibility: `basestring` only exists on Python 2; when the
# name is missing, fall back to `str`.
try:  # pragma no cover

    _basestring = basestring

except NameError:  # pragma no cover

    _basestring = str

# Same idea for the text type: `unicode` only exists on Python 2; when the
# name is missing, fall back to `str`.
try:  # pragma no cover

    _unicode = unicode

except NameError:  # pragma no cover

    _unicode = str

__author__ = 'Martin Blech'

__version__ = '0.10.2'

__license__ = 'MIT'

class ParsingInterrupted(Exception):

    """Raised to stop parsing when an item callback returns a false-ish value."""

    pass

class _DictSAXHandler(object):
    """Event handler that folds element/text events into nested dicts.

    The three event methods (`startElement`, `endElement`, `characters`) are
    meant to be installed as parser callbacks. While events arrive the
    handler keeps:

    * ``path``  -- list of ``(name, attrs-or-None)`` for the open elements;
    * ``stack`` -- saved ``(item, data)`` pairs of the enclosing elements;
    * ``item``  -- the dict (or ``None``) being built for the current element;
    * ``data``  -- list of text chunks seen for the current element.

    When the outermost collected element closes, ``item`` holds the result.
    """

    def __init__(self,
                 item_depth=0,
                 item_callback=lambda *args: True,
                 xml_attribs=True,
                 attr_prefix='@',
                 cdata_key='#text',
                 force_cdata=False,
                 cdata_separator='',
                 postprocessor=None,
                 dict_constructor=OrderedDict,
                 strip_whitespace=True,
                 namespace_separator=':',
                 namespaces=None,
                 force_list=None):
        """Store the conversion options and reset the parsing state.

        item_depth: only elements nested deeper than this are collected;
            elements closing exactly at this depth go to `item_callback`.
        item_callback: called as ``item_callback(path, item)``; a false-ish
            return value raises `ParsingInterrupted`.
        xml_attribs: when false, attributes are dropped.
        attr_prefix: prepended to attribute names used as dict keys.
        cdata_key: dict key under which an element's text is stored.
        force_cdata: store text under `cdata_key` even without attributes.
        cdata_separator: string used to join the collected text chunks.
        postprocessor: optional ``f(path, key, value)`` returning a new
            ``(key, value)`` pair, or ``None`` to drop the entry.
        dict_constructor: factory for every mapping that is created.
        strip_whitespace: strip surrounding whitespace from element text.
        namespace_separator: separator between namespace and local name.
        namespaces: optional mapping used to shorten namespaces.
        force_list: container of keys, or ``f(path, key, value)``, choosing
            the keys whose first value is wrapped in a list.
        """
        # Conversion options.
        self.item_depth = item_depth
        self.item_callback = item_callback
        self.xml_attribs = xml_attribs
        self.attr_prefix = attr_prefix
        self.cdata_key = cdata_key
        self.force_cdata = force_cdata
        self.cdata_separator = cdata_separator
        self.postprocessor = postprocessor
        self.dict_constructor = dict_constructor
        self.strip_whitespace = strip_whitespace
        self.namespace_separator = namespace_separator
        self.namespaces = namespaces
        self.force_list = force_list
        # Mutable parsing state.
        self.path = []
        self.stack = []
        self.data = []
        self.item = None

    def _build_name(self, full_name):
        """Return `full_name` with its namespace replaced via `namespaces`.

        Names are returned untouched when no namespace mapping is configured
        or when the name contains no separator. A namespace mapped to a
        false-ish value is removed altogether.
        """
        if not self.namespaces:
            return full_name
        separator = self.namespace_separator
        split_at = full_name.rfind(separator)
        if split_at == -1:
            return full_name
        namespace = full_name[:split_at]
        local_name = full_name[split_at+1:]
        short_namespace = self.namespaces.get(namespace, namespace)
        if short_namespace:
            return separator.join((short_namespace, local_name))
        return local_name

    def _attrs_to_dict(self, attrs):
        """Turn a flat ``[name, value, name, value, ...]`` list into a mapping.

        A value that already is a dict is returned as is.
        """
        if isinstance(attrs, dict):
            return attrs
        attr_names = attrs[0::2]
        attr_values = attrs[1::2]
        return self.dict_constructor(zip(attr_names, attr_values))

    def _joined_data(self):
        """Return the buffered text chunks joined, or None when there are none."""
        if not self.data:
            return None
        return self.cdata_separator.join(self.data)

    def startElement(self, full_name, attrs):
        """Open an element: extend the path and start a fresh item for it."""
        name = self._build_name(full_name)
        attrs = self._attrs_to_dict(attrs)
        self.path.append((name, attrs or None))
        if len(self.path) <= self.item_depth:
            # Not deep enough to be collected yet.
            return
        # Park the enclosing element's state until this element closes.
        self.stack.append((self.item, self.data))
        new_item = None
        if self.xml_attribs:
            entries = []
            for attr_name, attr_value in attrs.items():
                attr_key = self.attr_prefix+self._build_name(attr_name)
                if self.postprocessor:
                    entry = self.postprocessor(self.path, attr_key, attr_value)
                else:
                    entry = (attr_key, attr_value)
                if entry:
                    entries.append(entry)
            new_item = self.dict_constructor(entries)
        # An element without attributes starts as None, not as an empty dict.
        self.item = new_item or None
        self.data = []

    def endElement(self, full_name):
        """Close an element and attach its content to the enclosing item."""
        name = self._build_name(full_name)
        if len(self.path) == self.item_depth:
            # Streaming mode: hand the finished item (or its text) over.
            streamed = self.item
            if streamed is None:
                streamed = self._joined_data()
            if not self.item_callback(self.path, streamed):
                raise ParsingInterrupted()
        if not self.stack:
            self.item = None
            self.data = []
        else:
            text = self._joined_data()
            finished = self.item
            self.item, self.data = self.stack.pop()
            if self.strip_whitespace and text:
                text = text.strip() or None
            if text and self.force_cdata and finished is None:
                finished = self.dict_constructor()
            if finished is None:
                # Text-only (or empty) element: store the text directly.
                self.item = self.push_data(self.item, name, text)
            else:
                if text:
                    self.push_data(finished, self.cdata_key, text)
                self.item = self.push_data(self.item, name, finished)
        self.path.pop()

    def characters(self, data):
        """Buffer one chunk of character data for the current element."""
        if self.data:
            self.data.append(data)
        else:
            self.data = [data]

    def push_data(self, item, key, data):
        """Store `data` under `key` in `item` and return the mapping.

        The postprocessor, when set, may rewrite the pair or drop it by
        returning None. A missing `item` is created on demand. Repeated keys
        are collected into a list.
        """
        if self.postprocessor is not None:
            processed = self.postprocessor(self.path, key, data)
            if processed is None:
                return item
            key, data = processed
        if item is None:
            item = self.dict_constructor()
        try:
            existing = item[key]
            if not isinstance(existing, list):
                item[key] = [existing, data]
            else:
                existing.append(data)
        except KeyError:
            # First value for this key.
            item[key] = [data] if self._should_force_list(key, data) else data
        return item

    def _should_force_list(self, key, value):
        """Tell whether the first value stored under `key` must be a list."""
        if not self.force_list:
            return False
        try:
            return key in self.force_list
        except TypeError:
            # Not a container: treat force_list as a callable.
            return self.force_list(self.path[:-1], key, value)

def parse(xml_input, encoding=None, expat=expat, process_namespaces=False,
          namespace_separator=':', **kwargs):
    """Parse the given XML input and convert it into a dictionary.

    `xml_input` can either be a `string` or a file-like object (anything with
    a `read` attribute is handed to the parser as a file).

    `encoding` is passed to the parser; text input without an explicit
    encoding is encoded as UTF-8 before parsing.

    `process_namespaces` turns on namespace processing in the parser, using
    `namespace_separator` between the namespace and the local name.

    Any other keyword argument is forwarded to the SAX handler (for example
    `item_depth`, `item_callback`, `xml_attribs`, `attr_prefix`, `cdata_key`,
    `force_cdata`, `cdata_separator`, `postprocessor`, `dict_constructor`,
    `strip_whitespace`, `namespaces`, `force_list`).

    If `xml_attribs` is `True`, element attributes are put in the dictionary
    among regular child elements, using `@` as a prefix to avoid collisions. If
    set to `False`, they are just ignored.

    Simple example::

        >>> import xmltodict
        >>> doc = xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>
        ... \"\"\")
        >>> doc['a']['@prop']
        u'x'
        >>> doc['a']['b']
        [u'1', u'2']

    If `item_depth` is `0`, the function returns a dictionary for the root
    element (default behavior). Otherwise, it calls `item_callback` every time
    an item at the specified depth is found and returns `None` in the end
    (streaming mode).

    The callback function receives two parameters: the `path` from the document
    root to the item (name-attribs pairs), and the `item` (dict). If the
    callback's return value is false-ish, parsing will be stopped with the
    :class:`ParsingInterrupted` exception.

    Streaming example::

        >>> def handle(path, item):
        ...     print('path:%s item:%s' % (path, item))
        ...     return True
        ...
        >>> xmltodict.parse(\"\"\"
        ... <a prop="x">
        ...   <b>1</b>
        ...   <b>2</b>
        ... </a>\"\"\", item_depth=2, item_callback=handle)
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:1
        path:[(u'a', {u'prop': u'x'}), (u'b', None)] item:2

    The optional argument `postprocessor` is a function that takes `path`,
    `key` and `value` as positional arguments and returns a new `(key, value)`
    pair where both `key` and `value` may have changed. Usage example::

        >>> def postprocessor(path, key, value):
        ...     try:
        ...         return key + ':int', int(value)
        ...     except (ValueError, TypeError):
        ...         return key, value
        >>> xmltodict.parse('<a><b>1</b><b>2</b><b>x</b></a>',
        ...                 postprocessor=postprocessor)
        OrderedDict([(u'a', OrderedDict([(u'b:int', [1, 2]), (u'b', u'x')]))])

    You can pass an alternate version of `expat` (such as `defusedexpat`) by
    using the `expat` parameter. E.g:

        >>> import defusedexpat
        >>> xmltodict.parse('<a>hello</a>', expat=defusedexpat.pyexpat)
        OrderedDict([(u'a', u'hello')])

    You can use the force_list argument to force lists to be created even
    when there is only a single child of a given level of hierarchy. The
    force_list argument is a tuple of keys. If the key for a given level
    of hierarchy is in the force_list argument, that level of hierarchy
    will have a list as a child (even if there is only one sub-element).
    This is applied after any user-supplied postprocessor has already run.

        For example, given this input:

        <servers>
          <server>
            <name>host1</name>
            <os>Linux</os>
            <interfaces>
              <interface>
                <name>em0</name>
                <ip_address>10.0.0.1</ip_address>
              </interface>
            </interfaces>
          </server>
        </servers>

        If called with force_list=('interface',), it will produce
        this dictionary:

        {'servers':
          {'server':
            {'name': 'host1',
             'os': 'Linux',
             'interfaces':
              {'interface':
                [{'name': 'em0', 'ip_address': '10.0.0.1'}]}}}}

        `force_list` can also be a callable that receives `path`, `key` and
        `value`. This is helpful in cases where the logic that decides whether
        a list should be forced is more complex.

    """
    # The handler always gets the separator, even when the parser itself is
    # created without namespace processing below.
    handler = _DictSAXHandler(namespace_separator=namespace_separator,
                              **kwargs)
    if isinstance(xml_input, _unicode):
        if not encoding:
            encoding = 'utf-8'
        xml_input = xml_input.encode(encoding)
    if not process_namespaces:
        namespace_separator = None
    parser = expat.ParserCreate(
        encoding,
        namespace_separator
    )
    try:
        parser.ordered_attributes = True
    except AttributeError:
        # Jython's expat does not support ordered_attributes
        pass
    parser.StartElementHandler = handler.startElement
    parser.EndElementHandler = handler.endElement
    parser.CharacterDataHandler = handler.characters
    parser.buffer_text = True
    # Choose the entry point by inspecting the input instead of catching
    # TypeError/AttributeError around ParseFile: those exceptions can also
    # come from user callbacks or postprocessors, and retrying with Parse
    # would hide the real error behind an unrelated one.
    if hasattr(xml_input, 'read'):
        parser.ParseFile(xml_input)
    else:
        parser.Parse(xml_input, True)
    return handler.item

def _emit(key, value, content_handler,
          attr_prefix='@',
          cdata_key='#text',
          depth=0,
          preprocessor=None,
          pretty=False,
          newl='\n',
          indent='\t',
          full_document=True):
    """Write `value` as one or more `key` elements to `content_handler`.

    key: element name.
    value: element content. A dict describes attributes (keys starting with
        `attr_prefix`), text (`cdata_key`) and child elements (other keys);
        `None` gives an empty element; any other non-iterable value or a
        string becomes the element's text; any other iterable produces one
        element per entry.
    content_handler: object receiving `startElement`, `endElement`,
        `characters` and `ignorableWhitespace` calls.
    depth: nesting level of this element, used for indentation.
    preprocessor: optional ``f(key, value)`` returning a replacement
        ``(key, value)`` pair, or `None` to skip the element entirely.
    pretty, newl, indent: enable and configure pretty-printing.
    full_document: when true, more than one element at depth 0 raises
        `ValueError`.
    """
    if preprocessor is not None:
        result = preprocessor(key, value)
        if result is None:
            return
        key, value = result
    # Normalise to a sequence of values, one per element to emit.
    if (not hasattr(value, '__iter__')
            or isinstance(value, _basestring)
            or isinstance(value, dict)):
        value = [value]
    for index, v in enumerate(value):
        if full_document and depth == 0 and index > 0:
            raise ValueError('document with multiple roots')
        if v is None:
            v = OrderedDict()
        elif not isinstance(v, dict):
            v = _unicode(v)
        if isinstance(v, _basestring):
            v = OrderedDict(((cdata_key, v),))
        cdata = None
        attrs = OrderedDict()
        children = []
        for ik, iv in v.items():
            if ik == cdata_key:
                # Stringify explicit text the same way bare values are
                # stringified above; otherwise a falsy number such as 0 is
                # silently dropped by the content handler and other
                # non-string values make it raise.
                if iv is not None and not isinstance(iv, _basestring):
                    iv = _unicode(iv)
                cdata = iv
                continue
            if ik.startswith(attr_prefix):
                if not isinstance(iv, _unicode):
                    iv = _unicode(iv)
                attrs[ik[len(attr_prefix):]] = iv
                continue
            children.append((ik, iv))
        if pretty:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.startElement(key, AttributesImpl(attrs))
        if pretty and children:
            content_handler.ignorableWhitespace(newl)
        for child_key, child_value in children:
            _emit(child_key, child_value, content_handler,
                  attr_prefix=attr_prefix, cdata_key=cdata_key,
                  depth=depth+1, preprocessor=preprocessor,
                  pretty=pretty, newl=newl, indent=indent,
                  full_document=full_document)
        if cdata is not None:
            content_handler.characters(cdata)
        if pretty and children:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.endElement(key)
        if pretty and depth:
            content_handler.ignorableWhitespace(newl)

def unparse(input_dict, output=None, encoding='utf-8', full_document=True,
            **kwargs):
    """Serialise `input_dict` as XML (the reverse of `parse`).

    When `output` (a file-like object) is given, the XML is written to it and
    nothing is returned; otherwise the document is returned as a string.

    Keys starting with `attr_prefix` (default `'@'`) become attributes of the
    element, and the key equal to `cdata_key` (default `'#text'`) becomes its
    character data.

    With `full_document` true the XML declaration is written and
    `input_dict` must contain exactly one root, otherwise `ValueError` is
    raised.

    Pass `pretty=True` to pretty-print; lines then end with a newline and are
    indented with a tab unless `newl` / `indent` say otherwise. All extra
    keyword arguments are forwarded to the element emitter.
    """
    if full_document and len(input_dict) != 1:
        raise ValueError('Document must have exactly one root.')
    return_text = output is None
    if return_text:
        output = StringIO()
    generator = XMLGenerator(output, encoding)
    if full_document:
        generator.startDocument()
    for root_key, root_value in input_dict.items():
        _emit(root_key, root_value, generator, full_document=full_document,
              **kwargs)
    if full_document:
        generator.endDocument()
    if not return_text:
        return None
    text = output.getvalue()
    try:  # pragma no cover
        # Byte-string buffers need decoding; text buffers have no decode().
        text = text.decode(encoding)
    except AttributeError:  # pragma no cover
        pass
    return text

if __name__ == '__main__':  # pragma: no cover
    # Command-line mode: read XML from standard input and write every item
    # found at the depth given as the single argument to standard output,
    # serialised with marshal as a (path, item) pair.
    import marshal
    import sys

    try:
        # Use the binary streams when the text streams expose them.
        stdin = sys.stdin.buffer
        stdout = sys.stdout.buffer
    except AttributeError:
        stdin = sys.stdin
        stdout = sys.stdout

    # Exactly one command-line argument is expected: the item depth.
    (depth_argument,) = sys.argv[1:]
    item_depth = int(depth_argument)

    def handle_item(path, item):
        """Dump one (path, item) pair and ask the parser to keep going."""
        marshal.dump((path, item), stdout)
        return True

    try:
        root = parse(stdin,
                     item_depth=item_depth,
                     item_callback=handle_item,
                     dict_constructor=dict)
        # At depth 0 nothing is streamed, so dump the whole document once.
        if item_depth == 0:
            handle_item([], root)
    except KeyboardInterrupt:
        pass

【Python】 xml转json的更多相关文章

Python: xml转json
1,引言 GooSeeker早在9年前就开始了Semantic Web领域的产品化,MS谋数台和DS打数机是其中两个产品.对web内容做结构化转换和语义处理的主要路线是 XML -> RDF - ...
Python 解析构建数据大杂烩 -- csv、xml、json、excel
Python 可以通过各种库去解析我们常见的数据.其中 csv 文件以纯文本形式存储表格数据,以某字符作为分隔值,通常为逗号:xml 可拓展标记语言,很像超文本标记语言 Html ,但主要对文档和数据 ...
Python解析xml与JSON
xml与json是常用的文件交换格式,常用来表示网页的html则是xml的变种.解析xml和json在web开发中有着重要应用. DOM解析XML 文件对象模型(Document Object Mod ...
python入门（十）：XML和JSON解析
一.python解析XML 1.xml.dom.*模块,它是W3C DOM API的实现,若需要处理DOM API则该模块很适合,注意xml.dom包里面有许多模块,须区分它们间的不同: 2.xml. ...
python cookbook第三版学习笔记七：python解析csv,json,xml文件
CSV文件读取: Csv文件格式如下:分别有2行三列. 访问代码如下: f=open(r'E:\py_prj\test.csv','rb') f_csv=csv.reader(f) for f in ...
[Network] HTML、XML和JSON学习汇总
写在前面:楼主也是刚刚接触这方面的知识,之前完全是零基础,后来经朋友推荐了几个不错的博文,看完以后豁然开朗.但是此博文更加偏重于基础知识介绍(其实更深的楼主也还不了解,这方面的大神请绕道),只是分享个 ...
xml和json的区别
本文转自SanMaoSpace的博客链接地址如下:http://www.cnblogs.com/SanMaoSpace/p/3139186.html 1.定义介绍 (1).XML定义扩展标记语言 ( ...
数据解析（XML和JSON数据结构）
一解析二 XML数据结构三 JSON 数据结构一解析 1 定义: 从事先规定好的格式中提取数据解析的前提:提前约定好格式,数据提供方按照格式提供数据.数据获取方则按照 ...
【原】iOS学习之XML与JSON两种数据结构比较和各自底层实现
1.XML与JSON两种数据结构的优缺点 1> XML 优点:  格式统一, 符合标准  容易与其他系统进行远程交互, 数据共享比较方便   缺点: XML文件格式文件庞大, 格式复杂, 传输占 ...
XML与JSON的对比
XML与JSON的对比 1.各自定义 XML 扩展标记语言 (Extensible Markup Language, XML) ,用于标记电子文件使其具有结构性的标记语言,可以用来标记数据.定义数据类 ...

随机推荐

(28000): Access denied for user 'root'@'127.0.0.1' (using password: YES)
在一台测试服务器测试Python脚本时,执行Python脚本时报如下错误: 主要错误信息为"operation the sql fail!1045 (28000): Access den ...
U-boot-1.1.4中关于hello_world.srec出错
make[1]: *** No rule to make target `hello_world.srec', needed by `all'. Stop. make[1]: Leaving dir ...
Java Web项目（Extjs）报错五
1. Java Web项目(Extjs)报错五具体报错如下: usage: java org.apache.catalina.startup.Catalina [ -config {pathname ...
jquery绑定onkeyup()事件3中方法
$('input').keyup(function () { ... }); $('input').bind('keyup', function () { ... }); $('input').liv ...
异常-----The superclass "javax.servlet.http.HttpServlet" was not found on the Java Build Path。
1, 找到新建页面所在的工程名字,然后左键选中,右键弹出功能菜单,选择Build Path,进入配置路径. 2, 在java build path 页面的下选择Libraries栏目(默认选择),点击 ...
安卓中webview读取html，同时嵌入Flex的SWF，交互
安卓中webview读取html,同时嵌入Flex的SWF,交互安卓activity与html交互很简单,用javascript接口即可,网上一堆的例子,基本上没多大问题. 在html里面嵌入swf ...
hihocoder #1456 : Rikka with Lattice(杜教筛)
hihocoder #1456 : Rikka with Lattice(杜教筛) 题意 : 给你一个$n*m$方格图,统计上面有多少个格点三角形,除了三个顶点,不覆盖其他的格点(包括边和内部). ...
BZOJ 1926: [Sdoi2010]粟粟的书架(主席树,二分答案)
BZOJ 1926: [Sdoi2010]粟粟的书架(主席树,二分答案) 题意 : 给你一个长为$R$宽为$C$的矩阵,第$i$行$j$列的数为$P_{i,j}$. 有$m$次 ...
LightOJ1282 Leading and Trailing
题面给定两个数n,k 求n^k的前三位和最后三位 Input Input starts with an integer T (≤ 1000), denoting the number of test ...
[THUWC 2017]在美妙的数学王国中畅游
bzoj5020 \[答案误差只要小于 10^{-7}\] 题解 Taylor展开式: \[若f(x)的n阶导数在[a, b]内连续,则f(x)在x_{0}\in[a, b]可表示为\] \[f(x) ...

【Python】 xml转json

【Python】 xml转json的更多相关文章

随机推荐

热门专题