python pdf
admin
2023-07-23 19:42:36
0
# 从pdf中读取文本
# 写pdf
# 加密解密pdf
# 和平pdf,加水印
# pip install PyPDF2
%cd D:\python全站\office
import PyPDF2
D:\python全站\office
pdf_obj = open('coop.pdf', 'rb')
pdf = PyPDF2.PdfFileReader(pdf_obj)
pdf.numPages
3
page = pdf.getPage(0)
page.extractText()  # 提取文件
'\n\n \n \n1\\\n1\nN¥\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n\n\n \n \n\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n \n\n\n \n \n\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n \n'
# 提取中文 pip install pdfminer3k  #支持中文
from pdfminer.pdfinterp import PDFResourceManager, process_pdf # 资源管理
from pdfminer.converter import TextConverter  # 文本转换
from pdfminer.layout import LAParams #布局
from io import StringIO  # 生成临时文件

def convert_pdf(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams = laparams)
    fp = open(path, 'rb')
    process_pdf(rsrcmgr, device, fp)
    fp.close()
    device.close()
    out = retstr.getvalue()
    retstr.close()
    return out
s = convert_pdf('coop.pdf')
# print(s)
# convert_pdf('coop.pdf')
s.split('\n\x0c')
['测试语句 \n\n第 1 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n测试语句 \n\n第一页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n测试语句 \n\n第一页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n \n \n ',
 '测试语句 \n\n第 2 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n \n \n ',
 'de8ug word \n\n测试语句 \n\n第 3 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n ',
 '']
# 写pdf,从上文打开的pdf找出第二页,新鞋一个pdf
pdf_writer = PyPDF2.PdfFileWriter()
page = pdf.getPage(1)
pdf_writer.addPage(page)
with open('coop-1.pdf', 'wb') as f:
    pdf_writer.write(f)
pdf_obj.close()
# 加密pdf
with open('coop.pdf', 'rb') as f_in:
    pdf = PyPDF2.PdfFileReader(f_in)
    pdf_writer = PyPDF2.PdfFileWriter()
    for page_num in range(pdf.numPages):
        pdf_writer.addPage(pdf.getPage(page_num))
    pdf_writer.encrypt('hicoop')
    with open('coop-s.pdf', 'wb') as f_out:
        pdf_writer.write(f_out)
# 解密
with open('coop-s.pdf', 'rb') as f_in:
    pdf = PyPDF2.PdfFileReader(f_in)
    print(pdf.isEncrypted)
    pdf.decrypt('hicoop')
    pdf.getPage(0) #取到解密后的数据才能正常操作
True
# 合并多个pdf,加水印
with open('coop.pdf', 'rb') as f_in:
    with open('coop-watermarked.pdf', 'rb') as f_w:
        pdf = PyPDF2.PdfFileReader(f_in)
        pdf_w = PyPDF2.PdfFileReader(f_w)

        pdf_write = PyPDF2.PdfFileWriter()
        for page_num in range(pdf.numPages):
            page = pdf.getPage(page_num)
            page.mergePage(pdf_w.getPage(0))
            pdf_write.addPage(page)
        with open('coop-watermarked.pdf', 'wb') as f_out:
            pdf_write.write(f_out)
---------------------------------------------------------------------------

OSError                                   Traceback (most recent call last)

 in ()
      3     with open('coop-watermarked.pdf', 'rb') as f_w:
      4         pdf = PyPDF2.PdfFileReader(f_in)
----> 5         pdf_w = PyPDF2.PdfFileReader(f_w)
      6 
      7         pdf_write = PyPDF2.PdfFileWriter()

c:\users\coop\miniconda3\envs\coop\lib\site-packages\PyPDF2\pdf.py in __init__(self, stream, strict, warndest, overwriteWarnings)
   1082             stream = BytesIO(b_(fileobj.read()))
   1083             fileobj.close()
-> 1084         self.read(stream)
   1085         self.stream = stream
   1086 

c:\users\coop\miniconda3\envs\coop\lib\site-packages\PyPDF2\pdf.py in read(self, stream)
   1687         if debug: print(">>read", stream)
   1688         # start at the end:
-> 1689         stream.seek(-1, 2)
   1690         if not stream.tell():
   1691             raise utils.PdfReadError('Cannot read an empty file')

OSError: [Errno 22] Invalid argument

相关内容

热门资讯

国防部:民进党当局蓄意制造紧张... 6月25日下午,国防部举行例行记者会,国防部新闻发言人张晓刚大校答记者问。记者:美日正在开展“勇敢之...
顶级阴阳!韩记者提问韩国队主帅... 2026年美加墨世界杯A组第三轮比赛中,韩国队在“打平即出线”的大好形势下,以0比1不敌南非队,场上...
苹果iOS 27版日历更新汇总... 6 月 25 日消息,科技媒体 9to5Mac 昨日(6 月 24 日)发布博文,报道称在 iOS ...
董子健喊“我很空”,韩红喊“走... 图片为AI生成 出品|搜狐科技 作者|张莹 6月17日,北京,《抓特务》首映礼。影片配乐制作人韩红用...
海淀又现全球首次,银河通用机器... 记者6月25日从海淀区获悉,银河通用Galbot S1具身智能重载机器人目前已在宁德时代量产线上7×...
新质策源导刊丨格蓝若:让人形机... 具身智能:万亿赛道的 “价值兑现”之考(上) 编者按 资本潮涌下,具身智能正在叩响“价值兑现”的大门...
微软终于修复Win11蓝牙顽疾... IT之家 6 月 25 日消息,据 Windows Latest 报道,微软刚刚推出迄今为止规模最大...
车企不“造人”,就出局? 来源 | 伯虎财经(bohuFN) 作者 | 楷楷 2026年的全球车圈,都将目光投向了一个全新的赛...
巴拿马船舶在中国港口遭针对?外... 澎湃新闻记者 杨文钦 于潇清6月25日,外交部发言人郭嘉昆主持例行记者会。有记者提问,据报道,巴拿马...
外交部:委内瑞拉强震暂无中国公... 6月25日,外交部发言人郭嘉昆主持例行记者会。在回答有关委内瑞拉地震中有无中国公民伤亡的问题时,郭嘉...