#!/usr/bin/python
"""
  Python script to upload a PDF to the function called by https://docs2info.com/demo.

  This script has been tested with Python 3.8.5 on a Mac.

  Please send problem reports and other feedback to john@docs2info.com
"""
import requests
import json
import sys
import csv
import os

# Base address of the table extraction service (note the trailing slash).
HOST = 'https://pdftables-2uqbalu5ya-uw.a.run.app/'
# Endpoint that uploadPDF posts PDFs to.
URL = f'{HOST}upload'
# Directory that the extracted CSV files are written to.
OUT_DIR = 'extracted.tables'
# Tables narrower than MIN_WIDTH or shorter than MIN_HEIGHT are not written out.
MIN_WIDTH = 2
MIN_HEIGHT = 2

def uploadPDF(path):
  """
    uploadPDF uploads the PDF in path to a table extraction API and returns a dict of extracted
    data as a JSON string.

    Returns None (after printing the response) if the server does not answer with HTTP 200.

    The returned JSON dict has keys
      'NumberPages': Number of pages in PDF.
      'FirstPageProcessed': First page of PDF processed (1 - 'NumberPages').
      'LastPageProcessed': Last page of PDF processed (1 - 'NumberPages').
      'PageTables': The tables found.
    jsonToDoc below converts the JSON to a python dict.
  """
  # Open the PDF in a with-block so the file handle is closed as soon as the request has been
  # sent, instead of staying open for the rest of the run (one leaked handle per uploaded PDF).
  with open(path, 'rb') as pdf:
    multiple_files = [('uploaded-file', (path, pdf, 'application/pdf'))]
    r = requests.post(URL, files=multiple_files)
  if r.status_code != 200:
    print(f"uploadPDF failed. r={r}")
    return None
  return r.text

def key2int(d, k):
  """
    key2int converts key k in dict d to an int.

    The value stored under k is moved to the key int(k) and the old key is removed. d is modified
    in place and nothing is returned.
    If k already equals int(k) (e.g. k is already an int) the entry is kept.
    Raises KeyError if k is not in d, and whatever int(k) raises if k cannot be converted; d is
    left untouched in both cases.
  """
  value = d[k]
  new_key = int(k)
  # Remove the old key BEFORE inserting the new one. Inserting first and deleting afterwards
  # would delete the entry altogether whenever k and int(k) are the same dict key.
  del d[k]
  d[new_key] = value

def val2int(d, k):
  """
    val2int replaces the value stored under key k in dict d with its int() conversion.
    d is modified in place; nothing is returned.
  """
  as_int = int(d[k])
  d[k] = as_int

def jsonToDoc(text):
  """
    jsonToDoc converts json string text to a dict describing extracted tables.

    The following are converted to int:
      the values of 'NumberPages', 'FirstPageProcessed' and 'LastPageProcessed',
      the keys of 'PageTables' (keys of a JSON object are always strings),
      the 'Width' and 'Height' of every table listed under 'PageTables'.
    A null 'PageTables', or a null table list under one of its keys, is returned as an empty
    dict / empty list so that callers can iterate over the result without checking for None.

    Raises KeyError if one of the keys named above is missing and ValueError if text is not
    valid JSON or a value cannot be parsed as an int.
  """
  doc = json.loads(text)
  for k in ('NumberPages', 'FirstPageProcessed', 'LastPageProcessed'):
    doc[k] = int(doc[k])
  page_tables = {}
  for page, tables in (doc['PageTables'] or {}).items():
    tables = tables or []
    for table in tables:
      table['Width'] = int(table['Width'])
      table['Height'] = int(table['Height'])
    page_tables[int(page)] = tables
  doc['PageTables'] = page_tables
  return doc

def makeOutRoot(out_dir, in_path):
  """
    makeOutRoot builds an output path prefix for in_path: the file name of in_path, with its
    directory part and its last extension removed, joined onto directory out_dir.
  """
  stem = os.path.splitext(os.path.basename(in_path))[0]
  return os.path.join(out_dir, stem)

def writeCsvs(out_root, doc, min_width, min_height):
  """
    writeCsvs writes the tables in dict doc to CSV files with names starting with out_root.
    min_width is the minimum table width.
    min_height is the minimum table height.

    doc['PageTables'] maps a page number to a list of tables; each table is a dict with keys
    'Width', 'Height' and 'Data' (an iterable of rows). Tables narrower than min_width or
    shorter than min_height are skipped. Every other table is written to
    '<out_root>.page<PPP>.table<TTT>.csv', where PPP is the page number and TTT counts only the
    tables actually written for that page, starting at 1. A line is printed for each file written.
  """
  doc_tables = doc['PageTables']
  for page_num in sorted(doc_tables):
    table_num = 1
    for table in doc_tables[page_num]:
      width = table['Width']
      height = table['Height']
      if width < min_width or height < min_height:
        continue
      out_path = f'{out_root}.page{page_num:03d}.table{table_num:03d}.csv'
      data = table['Data']
      # newline='' lets the csv module control line endings itself; without it every row ends in
      # '\r\r\n' on Windows. An explicit UTF-8 encoding keeps non-ASCII cell text from failing
      # on platforms whose default encoding cannot represent it.
      with open(out_path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(data)
      print(f"Wrote a {width:2d} x {height:2d} table to '{out_path}'")
      table_num += 1

def main():
  """
    main uploads every PDF named on the command line and writes the tables extracted from each
    one as CSV files in directory OUT_DIR.

    Prints a usage message and exits with status 1 if no PDF path is given. A PDF for which
    uploadPDF returns nothing is skipped and processing continues with the next one.
  """
  if len(sys.argv) < 2:
    print("Usage: python extract_tables.py <PDF path>")
    # sys.exit rather than the exit() helper, which is only injected by the site module.
    sys.exit(1)
  os.makedirs(OUT_DIR, exist_ok=True)
  for in_path in sys.argv[1:]:
    text = uploadPDF(in_path)
    if not text:
      # Nothing to convert: uploadPDF returned None or an empty response body.
      continue
    doc = jsonToDoc(text)
    out_root = makeOutRoot(OUT_DIR, in_path)
    writeCsvs(out_root, doc, MIN_WIDTH, MIN_HEIGHT)

# Run only when executed as a script, so that importing this module (e.g. to reuse jsonToDoc)
# neither uploads anything nor exits the interpreter.
if __name__ == '__main__':
  main()
