Introduction
Sometimes PDFs contain links that point to many other files of interest (e.g. other PDFs). It is not uncommon for these to number a couple of hundred links. Clicking on all of them manually to download them would be a rather cumbersome task. It is much easier to automate this and download all files linked in a PDF automatically.
Opening a PDF
The biggest challenge when dealing with PDFs is opening and parsing them. The easiest way to open and manipulate PDFs in Python is pypdf. It can be installed using pip install pypdf. After installing it, PdfReader needs to be imported first.
from pypdf import PdfReader
Opening a PDF is straightforward using reader = PdfReader('filepath'). A page is accessible via reader.pages[idx]. Due to the nature of PDFs, finding URIs is less straightforward, as they are stored as “annotations”. To process the annotations, they need to be retrieved per page and iterated over:
reader = PdfReader('file_path')
page = reader.pages[0]
if page.annotations is not None:
    for j in range(len(page.annotations)):
        annot = page.annotations[j].get_object()
A single annotation might contain quite a lot, but a link annotation should be of type dict and contain an /A (action) entry. If it points to a URI, then /URI should be a key of that /A dict. The URI can then be filtered and processed further.
if isinstance(annot, dict) and \
        '/A' in annot.keys() and \
        isinstance(annot['/A'], dict) and \
        '/URI' in annot['/A'].keys():
    uri = annot['/A']['/URI']
    # do something with the URI retrieved
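Putting these pieces together, a minimal sketch that prints every URI found in a PDF could look like this (the file name example.pdf is a placeholder):

from pypdf import PdfReader

reader = PdfReader('example.pdf')  # placeholder file name
for page in reader.pages:
    if page.annotations is None:
        continue
    for annotation in page.annotations:
        annot = annotation.get_object()
        # link annotations carry their target in the /A action dict
        if isinstance(annot, dict) and \
                isinstance(annot.get('/A'), dict) and \
                '/URI' in annot['/A']:
            print(annot['/A']['/URI'])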
Download to a File
Using urllib makes downloading straightforward and, most importantly, it does not introduce additional dependencies. Using basic file I/O capabilities, downloading and storing the result as a file is straightforward (NB: this is not streaming to a file):
import urllib.request

req = urllib.request.Request(url=uri, method='GET')
resp = urllib.request.urlopen(req)
if resp.status == 200:
    contents = resp.read()
    with open('output_file.ext', 'wb') as f:
        f.write(contents)
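For large files, a streaming download avoids holding the whole response in memory. A minimal sketch using shutil.copyfileobj (uri and the output file name are placeholders, as above):

import shutil
import urllib.request

req = urllib.request.Request(url=uri, method='GET')
# stream the response body to disk in chunks instead of reading it all at once
with urllib.request.urlopen(req) as resp, open('output_file.ext', 'wb') as f:
    shutil.copyfileobj(resp, f)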
Script to Download all PDFs
Let’s put it all together and download all PDFs that are linked in a PDF.
import argparse
import os
import random
import time
import urllib.request

from pypdf import PdfReader
from tqdm import tqdm
def argument_parser() -> dict:
    """
    Argument Parser

    Parameters
    ----------
    None

    Returns
    -------
    config : dict
        configuration space
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-if',
        '--input_file',
        type=str,
        required=True,
        help='Input file'
    )
    parser.add_argument(
        '-df',
        '--download_folder',
        required=False,
        type=str,
        help='folder into which files should be downloaded'
    )
    parser.add_argument(
        '-dl',
        '--download',
        action='store_true',
        help='should files be downloaded (default: False)'
    )
    parser.add_argument(
        '-ua',
        '--user_agent',
        type=str,
        default='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
        help='provide a custom user agent'
    )
    parser.add_argument(
        '-ext',
        '--ext',
        type=str,
        default=None,
        help='valid file extension of files that should be downloaded'
    )
    args = parser.parse_args()
    config = {}
    config['input_file'] = args.input_file
    config['download_files'] = args.download
    config['download_folder'] = args.download_folder
    config['user_agent'] = args.user_agent
    config['file_ext'] = args.ext if args.ext is None else [args.ext]
    if config['download_files'] and config['download_folder'] is None:
        raise Exception('Download folder is required if files should be downloaded')
    return config
def parse_pdf(file_path: str,
              valid_file_ext: list[str] | None = None) -> list[str]:
    """
    Parse the PDF and extract all URIs that match a
    provided file extension

    Parameters
    ----------
    file_path: str
        file path of the PDF to open
    valid_file_ext: list[str] | None
        list of file extensions a URI must match to be considered valid

    Returns
    -------
    uris: list[str]
        list of URIs that match the valid file extensions
    """
    uris = []
    try:
        reader = PdfReader(file_path)
        page_count = len(reader.pages)
        for i in range(page_count):
            page = reader.pages[i]
            if page.annotations is not None:
                for j in range(len(page.annotations)):
                    annot = page.annotations[j].get_object()
                    if isinstance(annot, dict) and \
                            '/A' in annot.keys() and \
                            isinstance(annot['/A'], dict) and \
                            '/URI' in annot['/A'].keys():
                        uri = annot['/A']['/URI']
                        if valid_file_ext is not None:
                            ext = uri.split('.')[-1]
                            if ext in valid_file_ext:
                                uris.append(uri)
                        else:
                            uris.append(uri)
    except Exception as error:
        print(error, flush=True)
    return sorted(list(set(uris)))
def download_files(uris: list[str],
                   output_folder: str,
                   user_agent: str | None = None) -> bool:
    """
    Download all provided files to a designated output folder

    Parameters
    ----------
    uris: list[str]
        list of URIs to be downloaded
    output_folder: str
        designated output folder to download files to
    user_agent: str | None
        user agent to be used for downloading

    Returns
    -------
    success: bool
        True if no error was encountered
    """
    if user_agent is None:
        user_agent = 'curl'
    req_header = {'User-Agent': user_agent}
    success = True
    for uri in tqdm(uris):
        try:
            fn = uri.split('/')[-1]
            req = urllib.request.Request(url=uri, headers=req_header, method='GET')
            resp = urllib.request.urlopen(req)
            if resp.status == 200:
                contents = resp.read()
                with open(os.path.join(output_folder, fn), 'wb') as f:
                    f.write(contents)
        except Exception as error:
            print(error, flush=True)
            print(f"({uri})", flush=True)
            success = False
        # avoid running into rate limiters
        time.sleep(1.1 * random.random())
    return success
def main():
    print('=' * 72)
    config = argument_parser()
    if not os.path.exists(config['input_file']) or \
            not os.path.isfile(config['input_file']):
        raise Exception('Invalid input file')
    if config['download_files']:
        if not os.path.exists(config['download_folder']):
            os.makedirs(config['download_folder'], exist_ok=True)
        else:
            if not os.path.isdir(config['download_folder']):
                raise Exception('download folder exists as file')
    uris = parse_pdf(config['input_file'], config['file_ext'])
    if config['download_files']:
        download_files(uris, config['download_folder'], config['user_agent'])
    print('=' * 72)


if __name__ == '__main__':
    main()
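Assuming the script is saved as download_pdf_links.py (the name is arbitrary), downloading all PDFs linked in input.pdf into a folder ./downloads could look like this:

python download_pdf_links.py -if input.pdf -dl -df ./downloads -ext pdf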