Introduction
Sometimes PDFs contain links that point to many other files of interest (e.g. other PDFs). It is not uncommon for these to number a couple of hundred links. Clicking on all of them manually to download them would be a rather cumbersome task. It is much easier to automate this and download all files linked in a PDF automatically.
Opening a PDF
The biggest challenge when dealing with PDFs is opening and parsing them. The easiest way to open and manipulate PDFs in Python is pypdf. It can be installed using pip install pypdf. After installing it, PdfReader needs to be imported first.
from pypdf import PdfReader
Opening a PDF is straightforward using reader = PdfReader('filepath'). A page is accessible via reader.pages[idx]. Due to the nature of PDFs, finding URIs is less straightforward, as they are stored as “annotations”. To process the annotations, they need to be retrieved per page and iterated over:
reader = PdfReader('file_path')
page = reader.pages[0]
if page.annotations is not None:
    for j in range(len(page.annotations)):
        annot = page.annotations[j].get_object()
A single annotation might contain quite a lot, but a link annotation should be of type dict and contain an /A (action) entry. If it points to a URI, then /URI should be a key of that /A dict. The URI can then be filtered and processed further.
if isinstance(annot, dict) and \
        '/A' in annot.keys() and \
        isinstance(annot['/A'], dict) and \
        '/URI' in annot['/A'].keys():
    uri = annot['/A']['/URI']
    # do something with the URI retrieved
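Putting these pieces together, a minimal sketch that prints every URI found in a PDF could look like this (the file name example.pdf is a placeholder):

from pypdf import PdfReader

reader = PdfReader('example.pdf')  # placeholder file name
for page in reader.pages:
    if page.annotations is None:
        continue
    for annotation in page.annotations:
        annot = annotation.get_object()
        # link annotations carry their target in the /A action dict
        if isinstance(annot, dict) and \
                isinstance(annot.get('/A'), dict) and \
                '/URI' in annot['/A']:
            print(annot['/A']['/URI'])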
Download to a File
Using urllib makes downloading straightforward and, most importantly, it does not introduce additional dependencies. Using basic file I/O capabilities, downloading and storing the result as a file is straightforward (NB: this is not streaming to a file):
import urllib.request

req = urllib.request.Request(url=uri, method='GET')
resp = urllib.request.urlopen(req)
if resp.status == 200:
    contents = resp.read()
    with open('output_file.ext', 'wb') as f:
        f.write(contents)
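For large files, a streaming download avoids holding the whole response in memory. A minimal sketch using shutil.copyfileobj (uri and the output file name are placeholders, as above):

import shutil
import urllib.request

req = urllib.request.Request(url=uri, method='GET')
# stream the response body to disk in chunks instead of reading it all at once
with urllib.request.urlopen(req) as resp, open('output_file.ext', 'wb') as f:
    shutil.copyfileobj(resp, f)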
Script to Download all PDFs
Let’s put it all together and download all PDFs that are linked in a PDF.
import argparse
import os
import random
import time
import urllib.request

from pypdf import PdfReader
from tqdm import tqdm
def argument_parser() -> dict:
    """
    Argument Parser

    Parameters
    ----------
    None

    Returns
    -------
    config : dict
        configuration space
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-if',
        '--input_file',
        type=str,
        required=True,
        help='Input file'
    )
    parser.add_argument(
        '-df',
        '--download_folder',
        required=False,
        type=str,
        help='folder into which files should be downloaded'
    )
    parser.add_argument(
        '-dl',
        '--download',
        action='store_true',
        help='should files be downloaded (default: False)'
    )
    parser.add_argument(
        '-ua',
        '--user_agent',
        type=str,
        default='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
        help='provide a custom user agent'
    )
    parser.add_argument(
        '-ext',
        '--ext',
        type=str,
        default=None,
        help='valid file extension of files that should be downloaded'
    )
    args = parser.parse_args()
    config = {}
    config['input_file'] = args.input_file
    config['download_files'] = args.download
    config['download_folder'] = args.download_folder
    config['user_agent'] = args.user_agent
    config['file_ext'] = args.ext if args.ext is None else [args.ext]
    if config['download_files'] and config['download_folder'] is None:
        raise Exception('Download folder is required if files should be downloaded')
    return config
def parse_pdf(file_path: str,
              valid_file_ext: list[str] | None = None) -> list[str]:
    """
    Parse the PDF and extract all URIs that match a
    provided file extension

    Parameters
    ----------
    file_path: str
        file path of the PDF to open
    valid_file_ext: list[str] | None
        list of file extensions a URI must match to be considered valid

    Returns
    -------
    uris: list[str]
        list of URIs that match the valid file extensions
    """
    uris = []
    try:
        reader = PdfReader(file_path)
        page_count = len(reader.pages)
        for i in range(page_count):
            page = reader.pages[i]
            if page.annotations is not None:
                for j in range(len(page.annotations)):
                    annot = page.annotations[j].get_object()
                    if isinstance(annot, dict) and \
                            '/A' in annot.keys() and \
                            isinstance(annot['/A'], dict) and \
                            '/URI' in annot['/A'].keys():
                        uri = annot['/A']['/URI']
                        if valid_file_ext is not None:
                            ext = uri.split('.')[-1]
                            if ext in valid_file_ext:
                                uris.append(uri)
                        else:
                            uris.append(uri)
    except Exception as error:
        print(error, flush=True)
    return sorted(list(set(uris)))
def download_files(uris: list[str],
                   output_folder: str,
                   user_agent: str | None = None) -> bool:
    """
    Download all provided files to a designated output folder

    Parameters
    ----------
    uris: list[str]
        list of URIs to be downloaded
    output_folder: str
        designated output folder to download files to
    user_agent: str | None
        user agent to be used for downloading

    Returns
    -------
    success: bool
        True if no error was encountered
    """
    if user_agent is None:
        user_agent = 'curl'
    req_header = {'User-Agent': user_agent}
    success = True
    for uri in tqdm(uris):
        try:
            fn = uri.split('/')[-1]
            req = urllib.request.Request(url=uri, headers=req_header, method='GET')
            resp = urllib.request.urlopen(req)
            if resp.status == 200:
                contents = resp.read()
                with open(os.path.join(output_folder, fn), 'wb') as f:
                    f.write(contents)
        except Exception as error:
            print(error, flush=True)
            print(f"({uri})", flush=True)
            success = False
        # avoid running into rate limiters
        time.sleep(1.1 * random.random())
    return success
def main():
    print('=' * 72)
    config = argument_parser()
    if not os.path.exists(config['input_file']) or \
            not os.path.isfile(config['input_file']):
        raise Exception('Invalid input file')
    if config['download_files']:
        if not os.path.exists(config['download_folder']):
            os.makedirs(config['download_folder'], exist_ok=True)
        else:
            if not os.path.isdir(config['download_folder']):
                raise Exception('download folder exists as file')
    uris = parse_pdf(config['input_file'], config['file_ext'])
    if config['download_files']:
        download_files(uris, config['download_folder'], config['user_agent'])
    print('=' * 72)


if __name__ == '__main__':
    main()
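Assuming the script is saved as download_pdf_links.py (the name is arbitrary), downloading all PDFs linked in input.pdf into a folder ./downloads could look like this:

python download_pdf_links.py -if input.pdf -dl -df ./downloads -ext pdf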