Watermark Removal on PDF with PyPDF2
This section imports the necessary classes from the PyPDF2 library.
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.pdf import ContentStream
from PyPDF2.generic import TextStringObject, NameObject
from PyPDF2.utils import b_

# The watermark says SAMPLE on it so I've tried different capitalization cases
wm_text = 'Sample'
# I'm hoping to just replace the SAMPLE watermark with nothing so a space
# could suffice
replace_with = ''

# Load PDF into pyPDF. The input file is kept open until the output has been
# written, because the reader resolves page objects from the stream on demand.
with open('input.pdf', "rb") as input_stream:
    source = PdfFileReader(input_stream)
    output = PdfFileWriter()

    # For each page
    for page_number in range(source.getNumPages()):
        # Get the current page and its contents
        page = source.getPage(page_number)

        # Pages without a content stream (e.g. blank pages) are copied as-is.
        if "/Contents" in page:
            content_object = page["/Contents"].getObject()
            content = ContentStream(content_object, source)

            # Loop over all pdf elements
            for operands, operator in content.operations:
                # Was told to adapt this part dependent on my PDF file.
                # TJ takes an array operand; only its first element is
                # inspected, and empty arrays are skipped to avoid IndexError.
                if operator == b_("TJ") and operands and operands[0]:
                    text = operands[0][0]
                    if isinstance(text, TextStringObject) and text.startswith(wm_text):
                        # NOTE(review): this swaps the whole TJ array for a
                        # plain string, as the original did — confirm the
                        # target viewers accept that.
                        operands[0] = TextStringObject(replace_with)

            # Set the modified content as content object on the page
            page.__setitem__(NameObject('/Contents'), content)

        # Add the page to the output
        output.addPage(page)

    # Write the stream
    with open("output.pdf", "wb") as outputStream:
        output.write(outputStream)
Using the code from the question, here is a function that works in Python 3.
def remove_watermark(wm_text: str, inputFile: str, outputFile: str) -> None:
    """Blank out text-showing operations that start with *wm_text*.

    Reads the PDF at *inputFile*, walks every page's content stream, and
    replaces the operand of each ``Tj`` operation whose string operand starts
    with *wm_text* by an empty string. The result is written to *outputFile*.

    Args:
        wm_text: Prefix of the watermark text to look for.
        inputFile: Path of the PDF to read.
        outputFile: Path of the PDF to write.
    """
    from PyPDF4 import PdfFileReader, PdfFileWriter
    from PyPDF4.pdf import ContentStream
    from PyPDF4.generic import TextStringObject, NameObject
    from PyPDF4.utils import b_

    # The input file stays open until the output is written, because the
    # reader resolves page objects from the stream on demand.
    with open(inputFile, "rb") as f:
        # The original passed "rb" as a second positional argument; that slot
        # is the `strict` flag, not a file mode, so it is dropped here.
        source = PdfFileReader(f)
        output = PdfFileWriter()

        for page_number in range(source.getNumPages()):
            page = source.getPage(page_number)

            # Pages without a content stream (e.g. blank pages) are copied
            # unchanged instead of raising KeyError.
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, source)

                for operands, operator in content.operations:
                    if operator == b_("Tj"):
                        text = operands[0]
                        if isinstance(text, str) and text.startswith(wm_text):
                            operands[0] = TextStringObject('')

                page.__setitem__(NameObject('/Contents'), content)

            output.addPage(page)

        with open(outputFile, "wb") as outputStream:
            output.write(outputStream)


# Example invocation; replace the placeholder values with the real watermark
# text and file paths.
wm_text = 'wm_text'
inputFile = r'input.pdf'
outputFile = r"output.pdf"
remove_watermark(wm_text, inputFile, outputFile)