一、前言

初学python,看《python基础教程》,第20章实现了将文本转化成html的功能。由于本人之前有DIY一个markdown转html的算法,所以对这个例子有兴趣。可仔细一看,发现很难看懂,一个功能分散在几个文件中,各个类的耦合非常紧。虽然自己有几年的c++开发经验,但初看这个python代码也觉得头晕。

二、原版

以下是其源码

 from __future__ import generators

 def lines(file):
for line in file:
yield line
yield '\n' def blocks(file):
block = []
for line in lines(file):
if line.strip():
block.append(line)
elif block:
yield ''.join(block).strip()
block = []

util.py

# This Python file uses the following encoding: utf-8
class Rule:
    """
    Base class for all rules.

    Subclasses provide a ``type`` name and a ``condition(block)`` method;
    ``action`` wraps the block in the handler's start/end calls for that
    type and returns True to signal that rule processing should stop.
    """
    def action(self, block, handler):
        handler.start(self.type)
        handler.feed(block)
        handler.end(self.type)
        return True


class HeadingRule(Rule):
    """
    A heading is a single line that is at most 70 characters and
    that doesn't end with a colon.
    """
    type = 'heading'

    def condition(self, block):
        # endswith() instead of block[-1] so an empty block cannot raise.
        return ('\n' not in block
                and len(block) <= 70
                and not block.endswith(':'))


class TitleRule(HeadingRule):
    """
    The title is the first block in the document, provided that it is
    a heading.
    """
    type = 'title'
    first = True

    def condition(self, block):
        if not self.first:
            return False
        # Only the very first block is ever considered for the title.
        self.first = False
        return HeadingRule.condition(self, block)


class ListItemRule(Rule):
    """
    A list item is a paragraph that begins with a hyphen. As part of
    the formatting, the hyphen is removed.
    """
    type = 'listitem'

    def condition(self, block):
        # startswith() instead of block[0] so an empty block cannot raise.
        return block.startswith('-')

    def action(self, block, handler):
        handler.start(self.type)
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True


# start ListRule {
class ListRule(ListItemRule):
    """
    A list begins between a block that is not a list item and a
    subsequent list item. It ends after the last consecutive list
    item.
    """
    type = 'list'
    inside = False

    def condition(self, block):
        # Always True: every block has to be inspected to detect where a
        # list begins or ends.
        return True

    def action(self, block, handler):
        if not self.inside and ListItemRule.condition(self, block):
            handler.start(self.type)
            self.inside = True
        elif self.inside and not ListItemRule.condition(self, block):
            handler.end(self.type)
            self.inside = False
        # Always False: the remaining rules must still process the block.
        return False
# end ListRule }


class ParagraphRule(Rule):
    """
    A paragraph is simply a block that isn't covered by any of the
    other rules.
    """
    type = 'paragraph'

    def condition(self, block):
        return True

rules.py

 # start Handler {
class Handler:
    """
    An object that handles method calls from the Parser.

    The Parser will call the start() and end() methods at the
    beginning and end of each block, with the proper block name as
    parameter. The sub() method will be used in regular expression
    substitution. When called with a name such as 'emphasis', it will
    return a proper substitution function.
    """
    def callback(self, prefix, name, *args):
        # Dispatch to e.g. start_paragraph / sub_emphasis when it exists;
        # names without a matching method are silently ignored (None).
        method = getattr(self, prefix + name, None)
        if callable(method):
            return method(*args)

    def start(self, name):
        self.callback('start_', name)

    def end(self, name):
        self.callback('end_', name)

    def sub(self, name):
        # Fall back to the unchanged matched text when no sub_<name>
        # method exists (or it returns a false value).
        return lambda match: \
            self.callback('sub_', name, match) or match.group(0)
# end Handler }


# start HTMLHandler {
class HTMLHandler(Handler):
    """
    A specific handler used for rendering HTML.

    The methods in HTMLHandler are accessed from the superclass
    Handler's start(), end(), and sub() methods. They implement basic
    markup as used in HTML documents. Output is written with print.
    """
    def start_document(self):
        print('<html><head><title>...</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p>')

    def end_paragraph(self):
        print('</p>')

    def start_title(self):
        print('<h1>')

    def end_title(self):
        print('</h1>')

    def start_heading(self):
        print('<h2>')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul>')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def sub_emphasis(self, match):
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        return '<a href="%s">%s</a>' % (match.group(1), match.group(1))

    def sub_mail(self, match):
        return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))

    def feed(self, data):
        print(data)
# end HTMLHandler }

handlers.py

 import sys
import re
from handlers import *
from util import *
from rules import * # start Parser {
class Parser:
    """
    A Parser reads a text file, applying rules and controlling a
    handler.

    Filters are applied to every block first (inline substitutions);
    rules are then tried in the order they were added until one of
    their actions returns a true value.
    """
    def __init__(self, handler):
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        # The handler decides how a match is rewritten: handler.sub(name)
        # returns the substitution function for this filter.
        def apply_filter(block, handler):
            return re.sub(pattern, handler.sub(name), block)
        self.filters.append(apply_filter)

    def parse(self, file):
        self.handler.start('document')
        for block in blocks(file):
            for apply_filter in self.filters:
                block = apply_filter(block, self.handler)
            for rule in self.rules:
                if rule.condition(block):
                    last = rule.action(block, self.handler)
                    # A true return value means no further rule may
                    # handle this block.
                    if last:
                        break
        self.handler.end('document')
# end Parser }
# start BaseTextParser {
class BasicTextParser(Parser):
    """
    A specific Parser that adds rules and filters in its
    constructor.
    """
    def __init__(self, handler):
        Parser.__init__(self, handler)
        # Order matters: rules are tried in this sequence for every
        # block, and ParagraphRule is the catch-all at the end.
        self.addRule(ListRule())
        self.addRule(ListItemRule())
        self.addRule(TitleRule())
        self.addRule(HeadingRule())
        self.addRule(ParagraphRule())

        self.addFilter(r'\*(.+?)\*', 'emphasis')
        self.addFilter(r'(http://[\.a-zA-Z/]+)', 'url')
        self.addFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')
# end BaseTextParser }


# Script entry: read the plain text from stdin and print HTML to stdout.
handler = HTMLHandler()
parser = BasicTextParser(handler)

parser.parse(sys.stdin)

markup.py

文本如下

Welcome to World Wide Spam, Inc.

These are the corporate web pages of *World Wide Spam*, Inc. We hope
you find your stay enjoyable, and that you will sample many of our
products.

A short history of the company

World Wide Spam was started in the summer of 2000. The business
concept was to ride the dot-com wave and to make money both through
bulk email and by selling canned meat online.

After receiving several complaints from customers who weren't
satisfied by their bulk email, World Wide Spam altered their profile,
and focused 100% on canned goods. Today, they rank as the world's
13,892nd online supplier of SPAM.

Destinations

From this page you may visit several of our interesting web pages:

- What is SPAM? (http://wwspam.fu/whatisspam)

- How do they make it? (http://wwspam.fu/howtomakeit)

- Why should I eat it? (http://wwspam.fu/whyeatit)

How to get in touch with us

You can get in touch with us in *many* ways: By phone (555-1234), by
email (wwspam@wwspam.fu) or by visiting our customer feedback page
(http://wwspam.fu/feedback).

test_input.txt

使用命令行  python markup.py < test_input.txt > out.html  即可将文件转化为有格式的html文件

上面代码有几点不足之处:

  1. rules.py代码和handlers.py代码紧密耦合,rules.py、handlers.py一起来实现根据规则生成转化文本。rules.py中各种rule中定义了'heading', 'listitem'等类型,而handlers.py中有各种start_heading(), end_heading()方法来响应对应的类型。
  2. 对文本中特殊格式的转化Filter功能分布在markup.py和handlers.py中。markup.py 57-59行中定义了匹配模式,而替换的方法又在handlers.py 74-81行。
  3. ...

三、改进

下面是本人改进后的代码

 from __future__ import generators

 def lines(file):
for line in file:
yield line
yield '\n' def lines2(file):
for line in file:
s = line.strip()
if s:
yield s
yield '\n' def blocks(file):
block = []
for line in lines(file):
if line.strip():
block.append(line)
elif block:
yield ''.join(block).strip()
block = []

util.py

 import re

 def createFilter(pattern, fun):
def filter(line):
return re.sub(pattern, fun, line)
return filter def filterEm():
def subEm(match):
return '<em>%s</em>' % match.group(1)
return createFilter(r'\*(.+?)\*', subEm) def filterUrl():
def subUrl(match):
return '<a href="%s">%s</a>' % (match.group(1), match.group(1))
return createFilter(r'(http://[\.a-zA-Z/]+)', subUrl) def filterMail():
def subMail(match):
return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))
return createFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', subMail) def createFilters():
filters = []
filters.append(filterEm())
filters.append(filterUrl())
filters.append(filterMail())
return filters

filters.py

 # This Python file uses the following encoding: utf-8
class Rule:
    """Base rule: wraps a line in start()/end() output and prints it.

    ``action`` returns True, which tells the parser to stop trying
    further rules for the current line. ``endDoc`` is a hook called once
    after the last line; the default does nothing.
    """
    def action(self, line):
        self.start(line)
        self.feed(line)
        self.end(line)
        return True

    def start(self, line):
        pass

    def end(self, line):
        pass

    def feed(self, line):
        print(line)

    def endDoc(self):
        pass


class HeadingRule(Rule):  # {{{
    """A heading is a single line of at most 30 characters that does
    not end with a colon."""
    def condition(self, line):
        # endswith() instead of line[-1] so an empty line cannot raise.
        return ('\n' not in line
                and len(line) <= 30
                and not line.endswith(':'))

    def start(self, line):
        print('<h2>')

    def end(self, line):
        print('</h2>')


class TitleRule(HeadingRule):
    """The title is the first line checked, provided it is a heading."""
    first = True

    def condition(self, line):
        if not self.first:
            return False
        # Only the first line ever tested can become the title.
        self.first = False
        return HeadingRule.condition(self, line)

    def start(self, line):
        print('<h1>')

    def end(self, line):
        print('</h1>')
# }}}


class ListItemRule(Rule):  # {{{
    """A list item is a line starting with a hyphen; the hyphen is
    dropped from the output."""
    def condition(self, line):
        # startswith() instead of line[0] so an empty line cannot raise.
        return line.startswith('-')

    def feed(self, line):
        print(line[1:].strip())

    def start(self, line):
        print('<li>')

    def end(self, line):
        print('</li>')


class ListRule(ListItemRule):
    """Opens <ul> before the first of a run of list items and closes it
    at the first following line that is not a list item."""
    inside = False
    firstIn = False
    firstOut = False

    def condition(self, line):
        # Every line must be inspected to detect list boundaries.
        return True

    def action(self, line):
        if not self.inside and ListItemRule.condition(self, line):
            self.start(line)
            self.inside = True
        elif self.inside and not ListItemRule.condition(self, line):
            self.end(line)
            self.inside = False
        # Always False so the remaining rules still process the line.
        return False

    def start(self, line):
        print('<ul>')

    def end(self, line):
        print('</ul>')

    def feed(self, line):
        pass
# }}}


class ParagraphRule(Rule):
    """Catch-all rule: any line is a paragraph."""
    def condition(self, line):
        return True

    def start(self, line):
        print('<p>')

    def end(self, line):
        print('</p>')


class DocumentRule(Rule):
    """Emits the HTML document header on the first line and the footer
    when endDoc() is called."""
    first = True
    isStart = False

    def condition(self, line):
        # True exactly once, for the first line seen.
        if self.first:
            self.first = False
            self.isStart = True
            return True
        return False

    def action(self, line):
        if self.isStart:
            self.start(line)
            self.isStart = False
        # False: the first line must still be handled by the other rules.
        return False

    def start(self, line):
        print('<html><head><title>...</title></head><body>')

    def end(self, line):
        print('</body></html>')

    def endDoc(self):
        self.end('')

rules.py

 # This Python file uses the following encoding: utf-8
from util import *
from rules import *
import re
import sys


class MyParser:
    """Line-based parser: runs every filter, then the rules, on each line.

    Rules are tried in the order they were added; the first rule whose
    action returns a true value ends processing of that line. After the
    last line, endDoc() is called on every rule.
    """
    def __init__(self):
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        self.rules.append(rule)

    def setFilters(self, filters):
        self.filters = filters

    def parse(self, file):
        for line in lines2(file):
            for apply_filter in self.filters:
                line = apply_filter(line)
            for rule in self.rules:
                if rule.condition(line):
                    last = rule.action(line)
                    if last:
                        break
        # Called once the document has ended so rules can finish up.
        for rule in self.rules:
            rule.endDoc()

parsers.py

 from parsers import *
from util import *
from rules import *
from filters import *
import sys

# Build the parser. Rules are tried in the order added: DocumentRule
# first so the header precedes all other output, ListRule before
# ListItemRule so <ul> is opened before the first <li>, and
# ParagraphRule last as the catch-all.
p = MyParser()
p.addRule(DocumentRule())
p.addRule(ListRule())
p.addRule(ListItemRule())
p.addRule(TitleRule())
p.addRule(HeadingRule())
p.addRule(ParagraphRule())
p.setFilters(createFilters())

# Read plain text from stdin and print the HTML to stdout.
p.parse(sys.stdin)

main.py

使用命令  python main.py < test_input.txt > out.html  运行

有如下几点改动:

  1. rules和handles功能合在一起都放在rules.py中实现。
  2. 将Filter都放在filters.py中,并且可以看到匹配模式和替换函数写在一起,文本过滤这个功能容易一眼就看出如何实现。
  3. 添加了一个DocumentRule规则用来处理文档的开始和结束。并且在parsers.py 32行 循环调用每个rule类的endDoc()用以文档结束时的处理。当然现在只有DocumentRule类才会响应这个调用
  4. 在util.py中添加了lines2()函数,并且在parsers.py中使用这个函数来读取文本行

最后,代码应该写得容易让人看得懂  (尤其是在一本初始教程中)。

ps: 本人接下来将用上面的框架用python写个markdown转html的算法,然后再将代码转化成c++代码。最后完善自己的笔记软件并且用Qt写个跨windows/mac平台的markdown的编辑器。

改写《python基础教程》中的一个例子的更多相关文章

  1. Python 基础教程中的问题及解决方案(1)

    1. 在ubuntu中,调用终端时如: f = open('/home/theone/test_input.txt', 'r') 中的txt格式文本不能加后缀 正确的应为:  f = open('/h ...

  2. Python 基础教程 —— 网络爬虫入门篇

    前言 Python 是一种解释型.面向对象.动态数据类型的高级程序设计语言,它由 Guido van Rossum 于 1989 年底发明,第一个公开发行版发行于 1991 年.自面世以后,Pytho ...

  3. (Python基础教程之十三)Python中使用httplib2 – HTTP GET和POST示例

    Python基础教程 在SublimeEditor中配置Python环境 Python代码中添加注释 Python中的变量的使用 Python中的数据类型 Python中的关键字 Python字符串操 ...

  4. (Python基础教程之八)Python中的list操作

    Python基础教程 在SublimeEditor中配置Python环境 Python代码中添加注释 Python中的变量的使用 Python中的数据类型 Python中的关键字 Python字符串操 ...

  5. python基础教程笔记—即时标记(详解)

    最近一直在学习python,语法部分差不多看完了,想写一写python基础教程后面的第一个项目.因为我在网上看到的别人的博客讲解都并不是特别详细,仅仅是贴一下代码,书上内容照搬一下,对于当时刚学习py ...

  6. Python基础教程总结(一)

    引言: 一直都听说Python很强大,以前只是浏览了一些博客,发现有点像数学建模时使用的Matlab,就没有深入去了解了.如今Python使用的地方越来越多,最近又在学习机器学习方面的知识,因此想系统 ...

  7. Python基础教程-02

    <Python基础教程> 第3章 使用字符串 字符串方法find返回的并非布尔值.如果find像这样返回0,就意味着它在索引0处找到 了指定的子串 join可合并一个字符串列表,不能合并数 ...

  8. Python 基础教程 —— Pandas 库常用方法实例说明

    目录 1. 常用方法 pandas.Series 2. pandas.DataFrame ([data],[index])   根据行建立数据 3. pandas.DataFrame ({dic})  ...

  9. .Net程序员之Python基础教程学习----列表和元组 [First Day]

    一. 通用序列操作: 其实对于列表,元组 都属于序列化数据,可以通过下表来访问的.下面就来看看序列的基本操作吧. 1.1 索引: 序列中的所有元素的下标是从0开始递增的. 如果索引的长度的是N,那么所 ...

随机推荐

  1. 兼容各版本浏览器,封装原生Js获取ClassName

    web前端开发工作中常常会用到获取元素的className,用jQuery的$(".class")方法也可以获取className,但是有时候牵扯到数据而影响的加载顺序的原因会获取 ...

  2. $("").click与onclick的区别

    onclick是绑定事件,click本身是方法作用是触发onclick事件,只要执行了元素的click()方法,下面示例 Html代码 ? 1 2 3 4 5 6 7 8 9 10 11 12 13 ...

  3. centos6虚拟机复制后修改网卡

    方法1: 使用vmware创建centos6.4虚拟机, 创建完成后复制该虚拟机, 打开复制的虚拟机发现网卡名字是eth1,而网卡配置文件为eth0,mac地址变了 这时修改网卡配置文件, 删除uui ...

  4. htop基本使用

    一.什么是htop? top是所有类unix系统的必备工具,能直观方便的查看到系统负载.内存及进程等信息. 而htop具有top工具的全部功能且还新增了一些额外的功能和使用体验改进.与top相比,其具 ...

  5. 关于反射率(reflectance)

    首先,BRDF的内容因为见的多,用的多,所以比较容易理解.但是由BRDF引申出来的反射率,跟BRDF比不太常见,有些东西反而不易理解.尤其是组里的某大牛都不甚清楚(说明这个问题不太容易或者太过冷门), ...

  6. Prototype之个人见解

    prototype js 的对象比较 由于 js 是解释执行的语言, 那么再代码中出现函数与对象如果重复执行, 会创建多个副本 在代码中重复执行的代码容易出现重复的对象 创建一个 Person 构造函 ...

  7. WCF 部署在Windows 2012 IIS上各种报错的解决方法

    1.由于扩展配置问题而无法提供您请求的页面.如果该页面是脚本 ,请添加处理程序.如果勇载文件,请添加 MIME 映射. 以管理员身份,在cmd中运行C:\Windows\Microsoft.NET\F ...

  8. capwap协议重点分析

    一.     CAPWAP概述 CAPWAP由两个部分组成:CAPWAP协议和无线BINDING协议. (1)CAPWAP协议是一个通用的隧道协议,完成AP发现AC等基本协议功能,和具体的无线接入技术 ...

  9. CentOS7 监控进程网络流量工具安装

    服务器在做测试的时候,需要监控网络流量,用来了解在不同人数的时候服务器的网络使用量. 我们使用服务器环境是centos7,centos下通常使用iftop,或者nethogs来进行网络流量监控.这2个 ...

  10. maven - Eclipse构建maven项目

    前面的博文已经介绍了如何安装maven,本文将记录如何在Eclipse下构建maven项目. 一.Eclipse maven插件安装 关于安装Eclipse maven插件,网上有很多方法,这里推荐一 ...