关于python：使用PyPDF2在PDF上去除水印

Watermark Removal on PDF with PyPDF2

This section imports the necessary classes from the PyPDF2 library:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.pdf import ContentStream
from PyPDF2.generic import TextStringObject, NameObject
from PyPDF2.utils import b_

# The watermark says SAMPLE on it, so I've tried different capitalization cases.
wm_text = 'Sample'
# I'm hoping to just replace the SAMPLE watermark with nothing, so a space
# could suffice.
replace_with = ''

# Load the PDF into PyPDF2. The input file is kept open until the output has
# been written, because the pages handed to the writer still refer to the
# reader's stream.
with open('input.pdf', "rb") as input_stream:
    source = PdfFileReader(input_stream)
    output = PdfFileWriter()

    # For each page
    for page_number in range(source.getNumPages()):
        # Get the current page and its contents
        page = source.getPage(page_number)
        content_object = page["/Contents"].getObject()
        content = ContentStream(content_object, source)

        # Loop over all PDF elements
        for operands, operator in content.operations:
            # Was told to adapt this part depending on my PDF file.
            # NOTE(review): for the TJ operator the first operand is an array
            # of strings/numbers; this inspects only its first element and then
            # replaces the whole array with a single string -- confirm that the
            # resulting content stream is what is intended.
            if operator == b_("TJ"):
                text = operands[0][0]
                if isinstance(text, TextStringObject) and text.startswith(wm_text):
                    operands[0] = TextStringObject(replace_with)

        # Set the modified content as the content object on the page
        page[NameObject('/Contents')] = content

        # Add the page to the output
        output.addPage(page)

    # Write the stream
    with open("output.pdf", "wb") as outputStream:
        output.write(outputStream)

下面是基于问题中的代码改写的函数，可在 Python 3 中使用。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

def remove_watermark(wm_text, inputFile, outputFile):
    """Copy a PDF, blanking every ``Tj`` text string that starts with *wm_text*.

    Each page's content stream is parsed; for every ``Tj`` (show text)
    operation whose string operand starts with ``wm_text``, the operand is
    replaced by an empty string. The modified stream is stored back on the
    page and the page is appended to the output document.

    Only the ``Tj`` operator is inspected; text drawn with other operators
    (for example ``TJ``) is left untouched.

    Args:
        wm_text: Prefix of the watermark text to remove (case-sensitive).
        inputFile: Path of the PDF to read.
        outputFile: Path of the PDF to write. It is opened for writing while
            the input is still open, so it should not be the same file as
            ``inputFile``.

    Returns:
        None. The result is written to ``outputFile``.
    """
    from PyPDF4 import PdfFileReader, PdfFileWriter
    from PyPDF4.pdf import ContentStream
    from PyPDF4.generic import TextStringObject, NameObject
    from PyPDF4.utils import b_

    # Loop invariants, built once.
    show_text_operator = b_("Tj")
    contents_key = NameObject('/Contents')

    with open(inputFile, "rb") as f:
        # The original passed "rb" as a second argument; that position is the
        # reader's `strict` flag, not a file mode, so it is dropped here
        # (a truthy value is equivalent to the default).
        source = PdfFileReader(f)
        output = PdfFileWriter()

        for page_number in range(source.getNumPages()):
            page = source.getPage(page_number)

            # A page without a /Contents entry has nothing to scan; copy it
            # through unchanged instead of failing with KeyError.
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, source)

                for operands, operator in content.operations:
                    if operator != show_text_operator:
                        continue
                    text = operands[0]
                    if isinstance(text, str) and text.startswith(wm_text):
                        operands[0] = TextStringObject('')

                page[contents_key] = content

            output.addPage(page)

        # Write while the input file is still open: the pages added to the
        # writer still reference objects backed by the reader's stream.
        with open(outputFile, "wb") as outputStream:
            output.write(outputStream)

# Example invocation: the watermark prefix to strip, then the source and
# destination PDF paths.
wm_text = 'wm_text'
inputFile, outputFile = r'input.pdf', r"output.pdf"
remove_watermark(wm_text=wm_text, inputFile=inputFile, outputFile=outputFile)