"""Fetch 'Amazon Kindle Support' e-mails from Gmail, download the linked PDF,
de-duplicate by content hash, and move the processed e-mail to the trash."""

import base64
import hashlib
import logging
import os
import re
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# Configure logging
logging.basicConfig(level=logging.INFO)

# NOTE: gmail.modify allows messages().trash() but NOT messages().delete(),
# which needs the full https://mail.google.com/ scope.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly', 'https://www.googleapis.com/auth/gmail.modify']
SEARCH_QUERY = 'Amazon Kindle Support'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))


def get_credentials():
    """Return valid OAuth2 user credentials, refreshing or re-authorizing as needed.

    The token and client-secret locations can be overridden with the
    TOKEN_PATH and CREDENTIALS_PATH environment variables.

    Returns:
        google.oauth2.credentials.Credentials, or None if authorization failed.
    """
    creds = None
    token_path = os.getenv("TOKEN_PATH", os.path.join(SCRIPT_DIR, 'token.json'))
    credentials_path = os.getenv("CREDENTIALS_PATH", os.path.join(SCRIPT_DIR, 'credentials.json'))

    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
    if not creds or not creds.valid:
        try:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                # No usable token: run the interactive installed-app flow.
                flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
                creds = flow.run_local_server(port=0)
            # Persist the (possibly refreshed) token for the next run.
            with open(token_path, 'w') as token:
                token.write(creds.to_json())
        except Exception as e:
            logging.error("Failed to obtain credentials: %s", e)
            return None
    return creds


def fetch_unread_messages(service, max_results=10):
    """Return up to *max_results* unread INBOX message stubs (id/threadId dicts)."""
    try:
        results = service.users().messages().list(
            userId='me', labelIds=['INBOX'], q="is:unread",
            maxResults=max_results).execute()
        return results.get('messages', [])
    except Exception as error:
        logging.error("Error fetching unread messages: %s", error)
        return []


def calculate_file_hash(file_path):
    """Calculate SHA-256 hash of a file.

    Returns the hex digest, or None if the file could not be read.
    """
    sha256_hash = hashlib.sha256()
    try:
        with open(file_path, "rb") as f:
            # Stream in 4 KiB chunks so large PDFs are not loaded whole.
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
    except Exception as e:
        logging.error("Error reading file %s for hashing: %s", file_path, e)
        return None
    return sha256_hash.hexdigest()


def calculate_content_hash(content):
    """Calculate SHA-256 hex digest of an in-memory bytes payload."""
    sha256_hash = hashlib.sha256()
    sha256_hash.update(content)
    return sha256_hash.hexdigest()


def get_existing_versions(file_path):
    """Get all existing versions of a file (with suffixes) in the same directory.

    Matches the base file itself plus suffixed copies like ``name-1.pdf``.
    Returns an empty list when the directory does not exist yet (fresh install).
    """
    base, ext = os.path.splitext(file_path)
    dir_path = os.path.dirname(file_path)
    base_name = os.path.basename(base)

    # Guard: on a fresh install the data directory may not exist yet;
    # os.listdir would raise FileNotFoundError.
    if not os.path.isdir(dir_path):
        return []

    # Match files with suffix pattern `-1`, `-2`, etc., or the base file itself.
    pattern = re.compile(rf"{re.escape(base_name)}(-\d+)?{re.escape(ext)}$")
    existing_files = []
    for filename in os.listdir(dir_path):
        if filename.startswith(base_name) and filename.endswith(ext):
            if pattern.match(filename):
                existing_files.append(os.path.join(dir_path, filename))
    return existing_files


def get_unique_file_path(file_path):
    """Generate a unique file path by adding a suffix if the file already exists."""
    base, ext = os.path.splitext(file_path)
    counter = 1
    new_file_path = f"{base}-{counter}{ext}"
    while os.path.exists(new_file_path):
        counter += 1
        new_file_path = f"{base}-{counter}{ext}"
    return new_file_path


def download_pdf(pdf_link):
    """Download the PDF behind an Amazon redirect link into SCRIPT_DIR/data.

    The real file URL is carried in the redirect link's ``U`` query parameter.
    Skips the download when an identical file (by SHA-256) is already present;
    otherwise saves under a suffixed name if the base name is taken.
    """
    parsed_url = urlparse(pdf_link)
    query_params = parse_qs(parsed_url.query)
    actual_file_url = query_params.get('U', [None])[0]
    if not actual_file_url:
        logging.error("No valid file URL found in PDF link.")
        return

    file_name = os.path.basename(urlparse(actual_file_url).path) or "downloaded_file.pdf"
    data_dir = os.path.join(SCRIPT_DIR, "data")
    # BUGFIX: create the directory BEFORE get_existing_versions() lists it;
    # the original only created it after the listing, crashing on first run.
    os.makedirs(data_dir, exist_ok=True)
    file_path = os.path.join(data_dir, file_name)

    try:
        response = requests.get(actual_file_url, timeout=10)
        if response.status_code == 200:
            new_content_hash = calculate_content_hash(response.content)

            # Check all existing versions of the file for identical content.
            for existing_file in get_existing_versions(file_path):
                if calculate_file_hash(existing_file) == new_content_hash:
                    logging.info("An identical file already exists as %s. Skipping download.", existing_file)
                    return

            # No identical file found, save as a new version.
            if os.path.exists(file_path):
                # If base file exists, find a unique file name with suffix.
                file_path = get_unique_file_path(file_path)
            with open(file_path, "wb") as file:
                file.write(response.content)
            logging.info("File downloaded and saved to %s", file_path)
        else:
            logging.error("Failed to download the file. Status code: %s", response.status_code)
    except requests.exceptions.Timeout:
        logging.error("Request timed out while downloading PDF.")
    except requests.exceptions.RequestException as e:
        logging.error("An error occurred during file download: %s", e)


def process_email(service, message):
    """Process one message: if from SEARCH_QUERY, download its PDF and trash it.

    Looks for an anchor with the (German) text 'PDF herunterladen' in the
    HTML body.
    """
    try:
        msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
        headers = msg['payload']['headers']
        if any(SEARCH_QUERY in header.get('value', '') for header in headers if header['name'] == 'From'):
            mail_body = ''
            for part in msg.get('payload', {}).get('parts', []):
                try:
                    data = part['body']["data"]
                    byte_code = base64.urlsafe_b64decode(data)
                    mail_body += byte_code.decode("utf-8")
                except Exception:
                    # Parts without a decodable body (e.g. attachments) are skipped.
                    continue

            soup = BeautifulSoup(mail_body, "html.parser")
            link = soup.find("a", string="PDF herunterladen")
            if link:
                pdf_link = link.get("href")
                download_pdf(pdf_link)

                # BUGFIX: messages().delete() requires the full
                # https://mail.google.com/ scope and always 403s under
                # gmail.modify; trash() is permitted by the declared SCOPES.
                service.users().messages().trash(userId='me', id=message['id']).execute()
                logging.info("Email with ID %s moved to trash after PDF download.", message['id'])
            else:
                logging.info("No 'PDF herunterladen' link found in this email.")
    except Exception as error:
        logging.error("An error occurred while processing email ID %s: %s", message['id'], error)


def read_emails():
    """Entry point: authorize, fetch unread messages, and process each one."""
    creds = get_credentials()
    if not creds:
        logging.error("No valid credentials found.")
        return

    try:
        service = build('gmail', 'v1', credentials=creds)
        messages = fetch_unread_messages(service)

        if not messages:
            logging.info("No new messages.")
        else:
            for message in messages:
                process_email(service, message)
    except Exception as e:
        logging.error("Failed to initialize Gmail service: %s", e)


if __name__ == "__main__":
    read_emails()