Remove watermark with PyPDF2

2018年06月19日

Watermarks in PDF files can be very annoying, and they are usually hard to remove. With the Python package PyPDF2, however, it can be done after a little research.

PyPDF2 can parse the source PDF file and enumerate all of its elements, some of which may correspond to the watermark. Note that a watermark can be composed of several objects.

With code like the following, you can analyze the structure of a PDF file to identify the watermark objects.

from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.pdf import ContentStream
from PyPDF2.generic import TextStringObject, NameObject
from PyPDF2.utils import b_
import numpy as np

# Open the watermarked source PDF in binary mode. The handle is kept open
# while the pages are processed and is closed explicitly at the end of the
# script.
file = open('/tmp/深度学习+中文版.pdf', "rb")

# `output` collects the processed pages; `source` parses the input file.
output = PdfFileWriter()
source = PdfFileReader(file)

# Show the document information of the source file.
print(source.getDocumentInfo())


def guess_codes(s):
    """Return the names of all candidate codecs that can decode *s*.

    This is a debugging aid for text operands whose encoding is unknown:
    every codec in the candidate list is tried, and the ones that decode
    *s* without an error are printed and collected.

    Args:
        s: The byte string to probe. Any value that is not ``bytes`` or
            ``bytearray`` cannot be decoded and yields an empty list.

    Returns:
        A list of codec names (in candidate order) that decoded *s*
        successfully. A successful decode does not guarantee that the
        resulting text is meaningful.
    """
    available = []

    codes = (
        'ascii,big5,big5hkscs,'
        'cp037,cp273,cp424,cp437,cp500,cp720,cp737,cp775,cp850,cp852,cp855,'
        'cp856,cp857,cp858,cp860,cp861,cp862,cp863,cp864,cp865,cp866,cp869,'
        'cp874,cp875,cp932,cp949,cp950,cp1006,cp1026,cp1125,cp1140,cp1250,'
        'cp1251,cp1252,cp1253,cp1254,cp1255,cp1256,cp1257,cp1258,cp65001,'
        'euc_jp,euc_jis_2004,euc_jisx0213,euc_kr,gb2312,gbk,gb18030,hz,'
        'iso2022_jp,iso2022_jp_1,iso2022_jp_2,iso2022_jp_2004,iso2022_jp_3,'
        'iso2022_jp_ext,iso2022_kr,latin_1,iso8859_2,iso8859_3,iso8859_4,'
        'iso8859_5,iso8859_6,iso8859_7,iso8859_8,iso8859_9,iso8859_10,'
        'iso8859_11,iso8859_13,iso8859_14,iso8859_15,iso8859_16,johab,'
        'koi8_r,koi8_t,koi8_u,kz1048,mac_cyrillic,mac_greek,mac_iceland,'
        'mac_latin2,mac_roman,mac_turkish,ptcp154,shift_jis,shift_jis_2004,'
        'shift_jisx0213,utf_32,utf_32_be,utf_32_le,utf_16,utf_16_be,'
        'utf_16_le,utf_7,utf_8,utf_8_sig'
    ).split(',')

    # Only byte strings can be decoded. The original test was inverted
    # (`not isinstance(s, bytes)`), so real byte input was never probed.
    if not isinstance(s, (bytes, bytearray)):
        return available

    for c in codes:
        try:
            s.decode(c)
        except (UnicodeError, LookupError):
            # UnicodeError: the bytes are not valid in this codec.
            # LookupError: the codec is not available on this platform.
            continue
        print(c)
        available.append(c)

    return available



# Transformation matrices that mark the watermark objects of this document.
# Each row holds the six operands of one `cm` operator.
target_locations = np.array(
    [
        # 45-degree rotation combined with a translation
        [0.70711, 0.70711, -0.70711, 0.70711, -308.753, -165.279],
        # identity matrix combined with a translation
        [1.0, 0.0, 0.0, 1.0, -44.831, 54.605],
    ]
)


def match_location(location, target, epsilon=1e-5):
    """Tell whether *location* equals any row of *target* within *epsilon*.

    Args:
        location: Sequence of six operands, each offering ``as_numeric()``.
        target: n*6 numpy matrix with one candidate matrix per row.
        epsilon: Largest per-component deviation still counted as a match.

    Returns:
        A true value if every component of *location* is closer than
        *epsilon* to the matching component of at least one row.
    """
    values = np.array([operand.as_numeric() for operand in location])
    # Worst per-component deviation of `values` from each target row.
    worst_deviation = np.abs(values - target).max(axis=1)
    return np.any(worst_deviation < epsilon)


try:
    for p in range(source.getNumPages()):
        page = source.getPage(p)
        # print(page.extractText())

        # getContents() resolves /Contents and returns None when the page
        # has no content at all; such pages are copied unchanged.
        contents = page.getContents()
        if contents is not None:
            # ContentStream accepts a single stream as well as an array of
            # streams, so the contents object is passed as a whole instead
            # of being unpacked as a one-element array.
            content = ContentStream(contents, source)
            for operands, operator in content.operations:
                # print(operator, operands)  # type and value of each PDF element

                # This is the heart of the script: use whatever works to
                # find a feature that identifies the watermark. For plain
                # text watermarks, something like the following may do
                # (`target_str` has to be defined by you):
                # if operator == b_("TJ"):  # `b_` is only a Python 2/3 bytes conversion shim
                #     text = operands[0][0]
                #     # if isinstance(text, bytes):
                #     #     print('====  ', text, '  ====')
                #     #     for c in guess_codes(text):
                #     #         print(c, text.decode(c))
                #     if isinstance(text, TextStringObject) and text in target_str:
                #         operands[0] = TextStringObject('')

                if operator == b_("cm") and match_location(operands, target_locations):
                    # Dropping the operands of the matching `cm` operator
                    # is what makes the watermark disappear.
                    operands[:] = []

            page[NameObject('/Contents')] = content
        output.addPage(page)

    # The context manager closes the output file even if writing fails.
    with open("/tmp/output.pdf", "wb") as outputStream:
        output.write(outputStream)
finally:
    # The source file must stay open until the output has been written.
    file.close()

Some very simple watermarks may be just plain text, but others can be very complex. With the help of Inkscape, you can find the specific signature of the watermark quite easily: import several pages into Inkscape and use Edit -> XML Editor to view the structure of the pages.

Some SVG nodes have a transform matrix; use it to identify the corresponding PyPDF2 objects. Emptying the matrix operands of these objects makes the watermark disappear!

If you read a PDF file with PdfFileReader, modify every page, and then insert the pages into a PdfFileWriter, as the code above does, you may lose the bookmark information. A better choice is to extend the PdfFileMerger class, because it automatically handles bookmarks and other information. Some may suggest using the cloneDocumentFromReader method, but it may generate blank pages.

from PyPDF2 import PdfFileMerger
from PyPDF2.pdf import ContentStream
from PyPDF2.utils import b_
from PyPDF2.generic import NameObject
import numpy as np


class PdfFileModifyMerger(PdfFileMerger):
    """PdfFileMerger that can post-process every page it merges in."""

    def merge(self, *args, after_page_append=None, **kwargs):
        """Merge a file like ``PdfFileMerger.merge`` and then modify its pages.

        All positional and keyword arguments are forwarded unchanged to
        ``PdfFileMerger.merge(position, fileobj, bookmark=None, pages=None,
        import_bookmarks=True)``.

        Args:
            after_page_append: Optional iterable of callables
                ``f(pagedata, reader) -> pagedata``. Each is applied, in
                order, to every page inserted by this call.

        Raises:
            TypeError: If ``pages`` is neither ``None``, a ``PageRange`` nor
                a tuple.
        """
        # Imported here because the script's import block does not provide
        # PageRange; without it the isinstance() check below would raise
        # NameError whenever `pages` is given.
        from PyPDF2.pagerange import PageRange

        super().merge(*args, **kwargs)

        # Materialize once: avoids a shared mutable default and allows
        # one-shot iterables to be reused for every page.
        callbacks = tuple(after_page_append or ())

        # Reader of the file that was just merged (second element of the
        # most recent `inputs` entry).
        pdfr = self.inputs[-1][1]

        # `position` and `pages` may be given positionally or by keyword.
        position = kwargs['position'] if 'position' in kwargs else args[0]
        if 'pages' in kwargs:
            pages = kwargs['pages']
        elif len(args) > 3:
            pages = args[3]
        else:
            pages = None

        # Resolve `pages` into range() arguments to learn how many pages
        # were inserted at `position`.
        if pages is None:
            pages = (0, pdfr.getNumPages())
        elif isinstance(pages, PageRange):
            pages = pages.indices(pdfr.getNumPages())
        elif not isinstance(pages, tuple):
            raise TypeError('"pages" must be a tuple of (start, stop[, step])')

        inserted = len(range(*pages))
        for mp in self.pages[position:position + inserted]:
            for mf in callbacks:
                mp.pagedata = mf(mp.pagedata, pdfr)


def cm_remover(locations, epsilon=1e-5):
    """Build a page callback that blanks matching ``cm`` operators.

    Args:
        locations: n*6 numpy matrix; each row holds the six operands of a
            ``cm`` transformation matrix that belongs to the watermark.
        epsilon: Largest per-component deviation still counted as a match.

    Returns:
        A function ``remove_cm(pg, pdfr)`` that rewrites the content of the
        page ``pg`` (read through the reader ``pdfr``) and returns the page.
    """
    def matched_location(loc):
        # True if `loc` is within `epsilon` of any row of `locations`.
        return np.any(np.abs(np.array(loc) - locations).max(axis=1) < epsilon)

    def remove_cm(pg, pdfr):
        # getContents() returns None for a page without /Contents; there is
        # nothing to clean on such a page.
        contents = pg.getContents()
        if contents is None:
            return pg

        # ContentStream accepts a single stream as well as an array of
        # streams, so the contents object is passed as a whole instead of
        # being unpacked as a one-element array.
        content = ContentStream(contents, pdfr)
        for operands, operator in content.operations:
            # Only a `cm` with all six operands can be compared; this also
            # skips operators that an earlier pass has already emptied.
            if operator != b_("cm") or len(operands) != 6:
                continue
            if matched_location([i.as_numeric() for i in operands]):
                # this is where the magic happens
                operands[:] = []

        pg[NameObject('/Contents')] = content
        return pg

    return remove_cm


# Find the following values with InkScape, comparing them with the PyPDF2
# output. Each row holds the six operands of one `cm` operator.
target_locations = np.array(
    [
        # 45-degree rotation combined with a translation
        [0.70711, 0.70711, -0.70711, 0.70711, -308.753, -165.279],
        # identity matrix combined with a translation
        [1.0, 0.0, 0.0, 1.0, -44.831, 54.605],
    ]
)


output = PdfFileModifyMerger()

try:
    # Append the whole source file at the end of the (still empty) merger
    # and strip the watermark from every page that gets inserted.
    output.merge(len(output.pages),
                 '/tmp/深度学习+中文版.pdf',
                 after_page_append=[cm_remover(target_locations)])

    with open("/tmp/output.pdf", "wb") as outputStream:
        output.write(outputStream)
finally:
    # The merger opened the input file itself because a path was passed;
    # close() releases that file handle.
    output.close()