@ -4,6 +4,8 @@ import re
import tempfile
import jinja2
import os
import shutil
import zipfile
from playwright . sync_api import sync_playwright
from itertools import groupby
@ -23,15 +25,15 @@ def ensure_colon(heading: str) -> str:
return heading
def main ( ) :
parser = argparse . ArgumentParser ( description = " Generate a PDF report from a TSV input using Playwright." )
parser = argparse . ArgumentParser ( description = " Generate a file of PDF reports from a TSV input using Playwright." )
parser . add_argument ( " -i " , " --input_tsv " , required = True , help = " Path to the input TSV file. " )
parser . add_argument ( " -o " , " --output_pdf " , required = True , help = " Path to the output PDF f ile ." )
parser . add_argument ( " -o " , " --output_pdf _folder " , required = True , help = " Path to the output PDF f older ." )
parser . add_argument ( " --browser " , choices = [ " chromium " , " firefox " , " webkit " ] , default = " chromium " ,
help = " Which browser engine to use (default: chromium). " )
args = parser . parse_args ( )
input_tsv = args . input_tsv
output_pdf = args . output_pdf
output_pdf _folder = args . output_pdf _folder
# 1. Read the TSV data
with open ( input_tsv , ' r ' , encoding = ' utf-8-sig ' ) as f :
@ -43,9 +45,6 @@ def main():
r . get ( " Sending locality: " , " " ) . strip ( ) ,
r . get ( " Name (last, first): " , " " ) . strip ( )
) )
# Group rows by locality to generate separate pdfs
grouped_rows = groupby ( rows , key = lambda r : r . get ( " Sending locality: " , " " ) . strip ( ) )
# Known columns to skip
skip_cols = {
@ -58,7 +57,7 @@ def main():
# 3. Process each row into a data structure for Jinja2
processed_rows = [ ]
for row in rows :
#name = row.get('Name (last, first):', '').strip() # TODO: Refactor: This isn't working because it's 'Name (last, first): '. Unused for now.
name = row . get ( ' Name (last, first): ' , ' ' ) . strip ( ) # Unused, maybe refactor
locality = row . get ( ' Sending locality: ' , ' ' ) . strip ( )
q_and_a = [ ]
@ -97,11 +96,14 @@ def main():
} )
processed_rows . append ( {
# "name": name ,
" name " : name ,
" locality " : locality ,
" entries " : q_and_a
} )
processed_rows = sorted ( processed_rows , key = lambda r : r [ " locality " ] ) # required for groupby
grouped_rows = groupby ( processed_rows , key = lambda r : r [ " locality " ] )
# 4. Create an HTML template (Jinja2)
# We'll use page-break-after so each row is on a new PDF page.
html_template_str = r """
@ -130,12 +132,6 @@ def main():
page - break - after : always ;
}
h1 {
font - size : 18 pt ;
margin : 0 0 0.5 em 0 ;
color : #1a365d;
}
h2 {
font - size : 14 pt ;
margin : 0 ;
@ -244,7 +240,6 @@ def main():
< body >
{ % for row in rows % }
< div class = " trainee-page " >
< h1 > { { row . name } } < / h1 >
< p class = " locality " > < span class = " locality-label " > Sending Locality : < / span > < span class = " locality-value " > { { row . locality } } < / span > < / p >
{ % for entry in row . entries % }
@ -289,35 +284,56 @@ def main():
< / html >
"""
# 5. Render the template with Jinja2
template = jinja2 . Template ( html_template_str )
rendered_html = template . render ( rows = processed_rows )
# Define the output folder and zip filename
output_dir = output_pdf_folder
zip_filename = f " { output_dir } .zip "
# Remove existing output zip file if it exists
if os . path . exists ( zip_filename ) :
os . remove ( zip_filename )
# 6. Convert the HTML to PDF using Playwright
# We'll create a temporary HTML file, open it in a headless browser, and save as PDF.
with tempfile . NamedTemporaryFile ( suffix = " .html " , delete = False ) as tmp_file :
tmp_html_path = tmp_file . name
tmp_file . write ( rendered_html . encode ( " utf-8 " ) )
tmp_file . flush ( )
# Create or reset the output folder
if os . path . exists ( output_dir ) :
shutil . rmtree ( output_dir )
os . makedirs ( output_dir , exist_ok = True )
# Render and export one PDF per locality
template = jinja2 . Template ( html_template_str )
with sync_playwright ( ) as p :
# launch the selected browser (chromium/firefox/webkit)
browser = p . __getattribute__ ( args . browser ) . launch ( )
context = browser . new_context ( )
page = context . new_page ( )
# Load the local HTML file
page . goto ( f " file:// { tmp_html_path } " )
for locality , group in grouped_rows :
entries = list ( group ) # Materialize the group iterator
rendered_html = template . render ( rows = entries )
# PDF Options: letter format, etc.
# For more options, see: https://playwright.dev/python/docs/api/class-page#pagepdfoptions
page . pdf (
path = output_pdf ,
format = " letter " ,
margin = { " top " : " 0.75in " , " right " : " 0.75in " , " bottom " : " 0.75in " , " left " : " 0.75in " }
)
with tempfile . NamedTemporaryFile ( suffix = " .html " , delete = False ) as tmp_file :
tmp_html_path = tmp_file . name
tmp_file . write ( rendered_html . encode ( " utf-8 " ) )
tmp_file . flush ( )
# Sanitize filename (remove/replace special characters)
safe_locality = re . sub ( r " [^ \ w \ -_. ] " , " _ " , locality ) . strip ( ) . replace ( " " , " _ " )
pdf_filename = f " { safe_locality } .pdf "
pdf_path = os . path . join ( output_dir , pdf_filename )
page = context . new_page ( )
page . goto ( f " file:// { tmp_html_path } " )
page . pdf (
path = pdf_path ,
format = " letter " ,
margin = { " top " : " 0.75in " , " right " : " 0.75in " , " bottom " : " 0.75in " , " left " : " 0.75in " }
)
os . remove ( tmp_html_path )
print ( f " ✅ Generated PDF: { pdf_path } " )
browser . close ( )
# 📦 Zip the output folder
shutil . make_archive ( output_dir , ' zip ' , output_dir )
print ( f " 📁 Zipped folder created: { zip_filename } " )
# Script entry point: call main() only when this file is executed directly, not when it is imported.
if __name__ == " __main__ " :
main ( )