You may find the watermark in a PDF file very annoying, and it is usually very hard to remove. With the power of the Python package PyPDF2,
it can be done with a little research.
PyPDF2 can analyze the source PDF file and enumerate all of its elements, some of which may correspond to the watermark. Note that the watermark can be composed of several objects.
As in the following code, you can analyze the structure of the PDF file to identify the watermark objects.
from PyPDF2 import PdfFileReader , PdfFileWriter
from PyPDF2.pdf import ContentStream
from PyPDF2.generic import TextStringObject , NameObject
from PyPDF2.utils import b_
import numpy as np
# Open the source PDF in binary mode; this handle is closed at the end of the script.
file = open ( '/tmp/深度学习+中文版.pdf' , "rb" )
# Writer that collects the processed pages for the output document.
output = PdfFileWriter ()
# Reader used to access the pages of the source document.
source = PdfFileReader ( file )
# Print the document information reported by the reader.
print ( source . getDocumentInfo ())
def guess_codes(s):
    """Return the names of the codecs that can decode the byte string *s*.

    This is a debugging aid for text operands whose encoding is unknown:
    every candidate codec is tried and each one that decodes *s* without an
    error is printed and collected.

    Args:
        s: The value to test. Only ``bytes`` (or a subclass) can be decoded;
            any other type yields an empty list.

    Returns:
        A list of codec names, in candidate order, that decoded *s*.
    """
    avaiable_c = []
    codes = (
        'ascii,big5,big5hkscs,cp037,cp273,cp424,cp437,cp500,cp720,cp737,cp775,'
        'cp850,cp852,cp855,cp856,cp857,cp858,cp860,cp861,cp862,cp863,cp864,'
        'cp865,cp866,cp869,cp874,cp875,cp932,cp949,cp950,cp1006,cp1026,cp1125,'
        'cp1140,cp1250,cp1251,cp1252,cp1253,cp1254,cp1255,cp1256,cp1257,cp1258,'
        'cp65001,euc_jp,euc_jis_2004,euc_jisx0213,euc_kr,gb2312,gbk,gb18030,hz,'
        'iso2022_jp,iso2022_jp_1,iso2022_jp_2,iso2022_jp_2004,iso2022_jp_3,'
        'iso2022_jp_ext,iso2022_kr,latin_1,iso8859_2,iso8859_3,iso8859_4,'
        'iso8859_5,iso8859_6,iso8859_7,iso8859_8,iso8859_9,iso8859_10,'
        'iso8859_11,iso8859_13,iso8859_14,iso8859_15,iso8859_16,johab,koi8_r,'
        'koi8_t,koi8_u,kz1048,mac_cyrillic,mac_greek,mac_iceland,mac_latin2,'
        'mac_roman,mac_turkish,ptcp154,shift_jis,shift_jis_2004,shift_jisx0213,'
        'utf_32,utf_32_be,utf_32_le,utf_16,utf_16_be,utf_16_le,utf_7,utf_8,'
        'utf_8_sig'
    ).split(',')
    # Only byte strings can be decoded. The check used to be inverted, which
    # made the function return an empty list for every input.
    if not isinstance(s, bytes):
        return avaiable_c
    for c in codes:
        try:
            s.decode(c)
        except (UnicodeError, LookupError):
            # UnicodeError: the bytes are not valid in this codec.
            # LookupError: the codec is not available in this interpreter.
            continue
        print(c)
        avaiable_c.append(c)
    return avaiable_c
# Matrices to look for: one row per matrix, six values each. Every row is
# compared against the six operands of each `cm` operator in the page content.
target_locations = np . array ([
[ 0.70711 , 0.70711 , - 0.70711 , 0.70711 , - 308.753 , - 165.279 ],
[ 1 , 0 , 0 , 1 , - 44.831 , 54.605 ]])
def match_location(location, target, epsilon=1e-5):
    """Tell whether *location* equals any row of *target* within *epsilon*.

    Args:
        location: Sequence of objects exposing ``as_numeric()``; its length
            must equal the number of columns of *target*.
        target: 2-D numpy array with one candidate matrix per row (n x 6 in
            this script).
        epsilon: A row matches when its largest absolute difference from
            *location* is smaller than this value.

    Returns:
        A numpy boolean, true when at least one row matches.
    """
    values = np.array([operand.as_numeric() for operand in location])
    # Largest per-row deviation between the operands and each target row.
    worst_deviation = np.abs(values - target).max(axis=1)
    return np.any(worst_deviation < epsilon)
# Walk every page, blank the operands of the `cm` operators that match one of
# the target matrices, and write the result to a new PDF.
try:
    for p in range(source.getNumPages()):
        page = source.getPage(p)
        # print(page.extractText())
        if "/Contents" in page:  # a page without /Contents has nothing to rewrite
            # /Contents may be a single stream or an array of streams;
            # ContentStream accepts either, so pass the object through as is
            # instead of unpacking a one-element array.
            content = ContentStream(page["/Contents"].getObject(), source)
            for operands, operator in content.operations:
                # print(operator, operands)  # type and value of each PDF element
                # The core logic lives here: use whatever means are needed to
                # find the identifying features of the watermark.
                # if operator == b_("TJ"):  # `b_` is just boilerplate for bytes conversion across Python 2/3
                #     text = operands[0][0]
                #     # if isinstance(text, bytes):
                #     #     print('==== ', text, ' ====')
                #     #     for c in guess_codes(text):
                #     #         print(c, text.decode(c))
                #     if isinstance(text, TextStringObject) and text in target_str:
                #         operands[0] = TextStringObject('')
                if operator == b_("cm") and match_location(operands, target_locations):
                    # Emptying the operand list is what makes the watermark disappear.
                    operands[:] = []
            page[NameObject('/Contents')] = content
        output.addPage(page)
    with open("/tmp/output.pdf", "wb") as outputStream:
        output.write(outputStream)
finally:
    # Close the source PDF even when processing or writing fails.
    file.close()
Some very simple watermarks may be just plain text, but others can be very complex. With the help of Inkscape, one can find the specific signature of the watermark very easily. Import several pages into Inkscape and use the Edit -> XML Editor
to view the structure of the pages.
Some SVG nodes may have a transform matrix;
use it to identify the corresponding PyPDF2 objects. Setting the matrix operands of these objects to empty will make the watermark disappear!
If you read a PDF file into a PdfFileReader,
modify every page, and then insert the pages into a PdfFileWriter,
as the code above does, you may lose the bookmark information. A better choice is to extend the PdfFileMerger
class, because it automatically handles the bookmarks and other information. Some may suggest using the cloneDocumentFromReader
method, but it may generate blank pages.
from PyPDF2 import PdfFileMerger
from PyPDF2.pdf import ContentStream
from PyPDF2.utils import b_
from PyPDF2.generic import NameObject
import numpy as np
class PdfFileModifyMerger(PdfFileMerger):
    """A PdfFileMerger that can post-process every page it merges in."""

    def merge(self, *args, after_page_append=None, **kwargs):
        """Merge pages like ``PdfFileMerger.merge``, then run the page hooks.

        Args:
            *args: Positional arguments forwarded unchanged to
                ``PdfFileMerger.merge``; the first one, when given, is the
                insert position.
            after_page_append: Optional iterable of callables
                ``hook(pagedata, reader) -> pagedata``. They are applied, in
                order, to every page inserted by this call.
            **kwargs: Keyword arguments forwarded unchanged to
                ``PdfFileMerger.merge``.

        Raises:
            Whatever ``PdfFileMerger.merge`` raises for invalid arguments.
        """
        # Materialise once so a one-shot iterable is reusable for every page.
        hooks = [] if after_page_append is None else list(after_page_append)

        # Count the inserted pages by measuring the page list around the
        # parent call. This avoids re-parsing the `pages` argument (tuple,
        # PageRange, positional or keyword) and cannot disagree with what
        # the parent class actually inserted.
        pages_before = len(self.pages)
        super().merge(*args, **kwargs)
        inserted = len(self.pages) - pages_before

        position = args[0] if args else kwargs['position']
        # Clamp the position the same way a slice on the old page list does
        # (negative or past-the-end values).
        start = slice(position, position).indices(pages_before)[0]

        pdfr = self.inputs[-1][1]  # reader registered by the merge just performed
        for mp in self.pages[start:start + inserted]:
            for mf in hooks:
                mp.pagedata = mf(mp.pagedata, pdfr)
def cm_remover(locations, epsilon=1e-5):
    """Build a page hook that blanks the operands of matching ``cm`` operators.

    Args:
        locations: Target matrices, one per row with six values each (an
            n x 6 array-like). A single six-value sequence is also accepted.
        epsilon: A ``cm`` operator matches a row when every operand differs
            from the row's value by less than this amount.

    Returns:
        A function ``remove_cm(pg, pdfr)`` that rewrites the content of the
        page ``pg`` (read through the reader ``pdfr``) and returns the page.
    """
    # Normalise once so a single row and an n x 6 matrix behave the same.
    targets = np.atleast_2d(np.asarray(locations, dtype=float))

    def matched_location(loc):
        # An operand list of the wrong length cannot match any target row.
        if len(loc) != targets.shape[1]:
            return False
        return np.any(np.abs(np.array(loc) - targets).max(axis=1) < epsilon)

    def remove_cm(pg, pdfr):
        if "/Contents" not in pg:
            # A page without /Contents has nothing to rewrite.
            return pg
        # /Contents may be a single stream or an array of streams;
        # ContentStream accepts either, so pass the object through as is
        # instead of unpacking a one-element array.
        content = ContentStream(pg["/Contents"].getObject(), pdfr)
        cm_operator = b_("cm")
        for operands, operator in content.operations:
            if operator == cm_operator and matched_location([i.as_numeric() for i in operands]):
                # This is where the magic happens: emptying the operand list
                # is what makes the watermark disappear.
                operands[:] = []
        pg[NameObject('/Contents')] = content
        return pg

    return remove_cm
# Matrices of the `cm` operators to blank out, one row of six values each.
# Find these values with Inkscape and compare them against the PyPDF2 output.
target_locations = np.array([
    [0.70711, 0.70711, -0.70711, 0.70711, -308.753, -165.279],
    [1, 0, 0, 1, -44.831, 54.605],
])
output = PdfFileModifyMerger()
try:
    output.merge(
        len(output.pages),  # insert after the pages already in the merger
        '/tmp/深度学习+中文版.pdf',
        after_page_append=[cm_remover(target_locations)],
    )
    with open("/tmp/output.pdf", "wb") as outputStream:
        output.write(outputStream)
finally:
    # The input was passed by file name, so the merger holds the open file;
    # close() releases it.
    output.close()