added file appendix -1
This commit is contained in:
parent 9a769907b4
commit 63e6f2aea9
@@ -1,178 +0,0 @@
import os
import base64
import requests
import logging
import hashlib
import re
from urllib.parse import parse_qs, urlparse
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(level=logging.INFO)

SCOPES = ['https://www.googleapis.com/auth/gmail.readonly', 'https://www.googleapis.com/auth/gmail.modify']
SEARCH_QUERY = 'Amazon Kindle Support'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

def get_credentials():
    creds = None
    token_path = os.getenv("TOKEN_PATH", SCRIPT_DIR+'/token.json')
    credentials_path = os.getenv("CREDENTIALS_PATH", SCRIPT_DIR+'/credentials.json')

    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
    if not creds or not creds.valid:
        try:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
                creds = flow.run_local_server(port=0)
            with open(token_path, 'w') as token:
                token.write(creds.to_json())
        except Exception as e:
            logging.error(f"Failed to obtain credentials: {e}")
            return None
    return creds

def fetch_unread_messages(service, max_results=10):
    try:
        results = service.users().messages().list(userId='me', labelIds=['INBOX'], q="is:unread", maxResults=max_results).execute()
        return results.get('messages', [])
    except Exception as error:
        logging.error(f"Error fetching unread messages: {error}")
        return []

def calculate_file_hash(file_path):
    """Calculate SHA-256 hash of a file."""
    sha256_hash = hashlib.sha256()
    try:
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
    except Exception as e:
        logging.error(f"Error reading file {file_path} for hashing: {e}")
        return None
    return sha256_hash.hexdigest()

def calculate_content_hash(content):
    """Calculate SHA-256 hash of content."""
    sha256_hash = hashlib.sha256()
    sha256_hash.update(content)
    return sha256_hash.hexdigest()

def get_existing_versions(file_path):
    """Get all existing versions of a file (with suffixes) in the same directory."""
    base, ext = os.path.splitext(file_path)
    dir_path = os.path.dirname(file_path)
    base_name = os.path.basename(base)

    existing_files = []
    # The data directory may not exist on the first run; treat that as "no
    # versions yet" instead of letting os.listdir raise FileNotFoundError.
    if not os.path.isdir(dir_path):
        return existing_files
    for filename in os.listdir(dir_path):
        if filename.startswith(base_name) and filename.endswith(ext):
            # Match files with suffix pattern `-1`, `-2`, etc., or the base file itself
            if re.match(rf"{re.escape(base_name)}(-\d+)?{re.escape(ext)}$", filename):
                existing_files.append(os.path.join(dir_path, filename))
    return existing_files

def get_unique_file_path(file_path):
    """Generate a unique file path by adding a suffix if the file already exists."""
    base, ext = os.path.splitext(file_path)
    counter = 1
    new_file_path = f"{base}-{counter}{ext}"
    while os.path.exists(new_file_path):
        counter += 1
        new_file_path = f"{base}-{counter}{ext}"
    return new_file_path

def download_pdf(pdf_link):
    parsed_url = urlparse(pdf_link)
    query_params = parse_qs(parsed_url.query)
    actual_file_url = query_params.get('U', [None])[0]
    if not actual_file_url:
        logging.error("No valid file URL found in PDF link.")
        return

    file_name = os.path.basename(urlparse(actual_file_url).path) or "downloaded_file.pdf"
    data_dir = os.path.join(SCRIPT_DIR, "data")
    file_path = os.path.join(data_dir, file_name)

    try:
        response = requests.get(actual_file_url, timeout=10)
        if response.status_code == 200:
            new_content_hash = calculate_content_hash(response.content)

            # Check all existing versions of the file
            existing_files = get_existing_versions(file_path)
            for existing_file in existing_files:
                existing_file_hash = calculate_file_hash(existing_file)
                if existing_file_hash == new_content_hash:
                    logging.info(f"An identical file already exists as {existing_file}. Skipping download.")
                    return

            # No identical file found, save as a new version
            os.makedirs(data_dir, exist_ok=True)
            if os.path.exists(file_path):
                # If base file exists, find a unique file name with suffix
                file_path = get_unique_file_path(file_path)
            with open(file_path, "wb") as file:
                file.write(response.content)
            logging.info(f"File downloaded and saved to {file_path}")
        else:
            logging.error(f"Failed to download the file. Status code: {response.status_code}")
    except requests.exceptions.Timeout:
        logging.error("Request timed out while downloading PDF.")
    except requests.exceptions.RequestException as e:
        logging.error(f"An error occurred during file download: {e}")

def process_email(service, message):
    try:
        msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
        headers = msg['payload']['headers']
        if any(SEARCH_QUERY in header.get('value', '') for header in headers if header['name'] == 'From'):
            mail_body = ''
            for part in msg.get('payload', {}).get('parts', []):
                try:
                    data = part['body']["data"]
                    byte_code = base64.urlsafe_b64decode(data)
                    mail_body += byte_code.decode("utf-8")
                except Exception:
                    continue

            soup = BeautifulSoup(mail_body, "html.parser")
            link = soup.find("a", string="PDF herunterladen")
            if link:
                pdf_link = link.get("href")
                download_pdf(pdf_link)

                # Note: messages().delete requires the full https://mail.google.com/
                # scope; with gmail.modify alone, messages().trash is the permitted call.
                service.users().messages().delete(userId='me', id=message['id']).execute()
                logging.info(f"Email with ID {message['id']} successfully deleted after PDF download.")
            else:
                logging.info("No 'PDF herunterladen' link found in this email.")
    except Exception as error:
        logging.error(f"An error occurred while processing email ID {message['id']}: {error}")

def read_emails():
    creds = get_credentials()
    if not creds:
        logging.error("No valid credentials found.")
        return

    try:
        service = build('gmail', 'v1', credentials=creds)
        messages = fetch_unread_messages(service)

        if not messages:
            logging.info("No new messages.")
        else:
            for message in messages:
                process_email(service, message)
    except Exception as e:
        logging.error(f"Failed to initialize Gmail service: {e}")

if __name__ == "__main__":
    read_emails()
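
The core mechanism the appendix script adds is hash-based de-duplication combined with `-1`, `-2`, ... version suffixes. A minimal, self-contained sketch of that behaviour, using an invented file name instead of the script's Gmail plumbing:

import hashlib
import os
import re
import tempfile

with tempfile.TemporaryDirectory() as d:
    # Pretend an earlier run already saved invoice.pdf.
    with open(os.path.join(d, "invoice.pdf"), "wb") as f:
        f.write(b"version one")

    new_data = b"version two"
    new_hash = hashlib.sha256(new_data).hexdigest()

    # Compare the new content against every existing version (invoice.pdf, invoice-1.pdf, ...).
    versions = [v for v in os.listdir(d) if re.match(r"invoice(-\d+)?\.pdf$", v)]
    if any(hashlib.sha256(open(os.path.join(d, v), "rb").read()).hexdigest() == new_hash
           for v in versions):
        print("identical file already exists, skipping")
    else:
        # Different content: take the first free suffixed name, as get_unique_file_path does.
        counter = 1
        while os.path.exists(os.path.join(d, f"invoice-{counter}.pdf")):
            counter += 1
        with open(os.path.join(d, f"invoice-{counter}.pdf"), "wb") as f:
            f.write(new_data)
        print(f"saved as invoice-{counter}.pdf")  # prints: saved as invoice-1.pdf
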
@@ -2,6 +2,8 @@ import os
 import base64
 import requests
 import logging
+import hashlib
+import re
 from urllib.parse import parse_qs, urlparse
 from google.auth.transport.requests import Request
 from google.oauth2.credentials import Credentials
@@ -18,16 +20,23 @@ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 
 def get_credentials():
     creds = None
-    if os.path.exists(SCRIPT_DIR+'/token.json'):
-        creds = Credentials.from_authorized_user_file(SCRIPT_DIR+'/token.json', SCOPES)
+    token_path = os.getenv("TOKEN_PATH", SCRIPT_DIR+'/token.json')
+    credentials_path = os.getenv("CREDENTIALS_PATH", SCRIPT_DIR+'/credentials.json')
+
+    if os.path.exists(token_path):
+        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
     if not creds or not creds.valid:
+        try:
             if creds and creds.expired and creds.refresh_token:
                 creds.refresh(Request())
             else:
-            flow = InstalledAppFlow.from_client_secrets_file(SCRIPT_DIR+'/credentials.json', SCOPES)
+                flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
                 creds = flow.run_local_server(port=0)
-        with open(SCRIPT_DIR+'/token.json', 'w') as token:
+            with open(token_path, 'w') as token:
                 token.write(creds.to_json())
+        except Exception as e:
+            logging.error(f"Failed to obtain credentials: {e}")
+            return None
     return creds
 
 def fetch_unread_messages(service, max_results=10):
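
The two os.getenv calls added above let a deployment override where the OAuth files live while keeping the old side-by-side default. A short sketch of the fallback semantics, with placeholder paths:

import os

# When TOKEN_PATH is unset, os.getenv returns the second argument, so the
# script keeps reading token.json next to itself; /run/secrets is a made-up example.
os.environ.pop("TOKEN_PATH", None)
print(os.getenv("TOKEN_PATH", "./token.json"))    # ./token.json

os.environ["TOKEN_PATH"] = "/run/secrets/gmail-token.json"
print(os.getenv("TOKEN_PATH", "./token.json"))    # /run/secrets/gmail-token.json
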
@@ -38,8 +47,49 @@ def fetch_unread_messages(service, max_results=10):
         logging.error(f"Error fetching unread messages: {error}")
         return []
 
+def calculate_file_hash(file_path):
+    """Calculate SHA-256 hash of a file."""
+    sha256_hash = hashlib.sha256()
+    try:
+        with open(file_path, "rb") as f:
+            for byte_block in iter(lambda: f.read(4096), b""):
+                sha256_hash.update(byte_block)
+    except Exception as e:
+        logging.error(f"Error reading file {file_path} for hashing: {e}")
+        return None
+    return sha256_hash.hexdigest()
+
+def calculate_content_hash(content):
+    """Calculate SHA-256 hash of content."""
+    sha256_hash = hashlib.sha256()
+    sha256_hash.update(content)
+    return sha256_hash.hexdigest()
+
+def get_existing_versions(file_path):
+    """Get all existing versions of a file (with suffixes) in the same directory."""
+    base, ext = os.path.splitext(file_path)
+    dir_path = os.path.dirname(file_path)
+    base_name = os.path.basename(base)
+
+    existing_files = []
+    # The data directory may not exist on the first run; treat that as "no
+    # versions yet" instead of letting os.listdir raise FileNotFoundError.
+    if not os.path.isdir(dir_path):
+        return existing_files
+    for filename in os.listdir(dir_path):
+        if filename.startswith(base_name) and filename.endswith(ext):
+            # Match files with suffix pattern `-1`, `-2`, etc., or the base file itself
+            if re.match(rf"{re.escape(base_name)}(-\d+)?{re.escape(ext)}$", filename):
+                existing_files.append(os.path.join(dir_path, filename))
+    return existing_files
+
+def get_unique_file_path(file_path):
+    """Generate a unique file path by adding a suffix if the file already exists."""
+    base, ext = os.path.splitext(file_path)
+    counter = 1
+    new_file_path = f"{base}-{counter}{ext}"
+    while os.path.exists(new_file_path):
+        counter += 1
+        new_file_path = f"{base}-{counter}{ext}"
+    return new_file_path
+
 def download_pdf(pdf_link):
     # Parse URL and get the actual file URL
     parsed_url = urlparse(pdf_link)
     query_params = parse_qs(parsed_url.query)
     actual_file_url = query_params.get('U', [None])[0]
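
For illustration, the suffix regex introduced in get_existing_versions can be exercised on its own; the file names are invented:

import re

base_name, ext = "invoice", ".pdf"
pattern = rf"{re.escape(base_name)}(-\d+)?{re.escape(ext)}$"

for name in ["invoice.pdf", "invoice-3.pdf", "invoice-draft.pdf", "invoice.pdf.bak"]:
    # Only the base name and purely numeric -N suffixes count as versions.
    print(name, bool(re.match(pattern, name)))
# invoice.pdf True, invoice-3.pdf True, invoice-draft.pdf False, invoice.pdf.bak False
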
@@ -47,27 +97,36 @@ def download_pdf(pdf_link):
         logging.error("No valid file URL found in PDF link.")
         return
 
     # Extract the file name
     file_name = os.path.basename(urlparse(actual_file_url).path) or "downloaded_file.pdf"
     data_dir = os.path.join(SCRIPT_DIR, "data")
     file_path = os.path.join(data_dir, file_name)
 
-    # Check if file exists
-    if os.path.exists(file_path):
-        logging.info(f"{file_name} already exists. Skipping download.")
+    try:
+        response = requests.get(actual_file_url, timeout=10)
+        if response.status_code == 200:
+            new_content_hash = calculate_content_hash(response.content)
+
+            # Check all existing versions of the file
+            existing_files = get_existing_versions(file_path)
+            for existing_file in existing_files:
+                existing_file_hash = calculate_file_hash(existing_file)
+                if existing_file_hash == new_content_hash:
+                    logging.info(f"An identical file already exists as {existing_file}. Skipping download.")
+                    return
+
-    # Download and save the file
-    try:
-        response = requests.get(actual_file_url)
-        if response.status_code == 200:
+            # No identical file found, save as a new version
             os.makedirs(data_dir, exist_ok=True)
+            if os.path.exists(file_path):
+                # If base file exists, find a unique file name with suffix
+                file_path = get_unique_file_path(file_path)
             with open(file_path, "wb") as file:
                 file.write(response.content)
             logging.info(f"File downloaded and saved to {file_path}")
         else:
             logging.error(f"Failed to download the file. Status code: {response.status_code}")
-    except Exception as e:
+    except requests.exceptions.Timeout:
+        logging.error("Request timed out while downloading PDF.")
+    except requests.exceptions.RequestException as e:
         logging.error(f"An error occurred during file download: {e}")
 
 def process_email(service, message):
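
The 'U' handling in download_pdf assumes the mail links are redirect-style URLs that carry the real file URL percent-encoded in a U query parameter. A sketch with a fabricated link (parse_qs decodes the percent-encoding):

from urllib.parse import parse_qs, urlparse

pdf_link = "https://tracker.example.com/r?U=https%3A%2F%2Ffiles.example.com%2Fdocs%2Finvoice.pdf&id=42"
actual_file_url = parse_qs(urlparse(pdf_link).query).get('U', [None])[0]
print(actual_file_url)  # https://files.example.com/docs/invoice.pdf
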
@@ -84,23 +143,26 @@ def process_email(service, message):
                 except Exception:
                     continue
 
             # Parse HTML and find the PDF link
             soup = BeautifulSoup(mail_body, "html.parser")
             link = soup.find("a", string="PDF herunterladen")
             if link:
                 pdf_link = link.get("href")
                 download_pdf(pdf_link)
 
                 # After successful download, delete the email
                 service.users().messages().delete(userId='me', id=message['id']).execute()
                 logging.info(f"Email with ID {message['id']} successfully deleted after PDF download.")
             else:
                 logging.info("No 'PDF herunterladen' link found in this email.")
     except Exception as error:
-        logging.error(f"An error occurred while processing email: {error}")
+        logging.error(f"An error occurred while processing email ID {message['id']}: {error}")
 
 def read_emails():
     creds = get_credentials()
     if not creds:
         logging.error("No valid credentials found.")
         return
 
+    try:
         service = build('gmail', 'v1', credentials=creds)
         messages = fetch_unread_messages(service)
 
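
The link extraction in process_email matches the anchor by its exact text, the German "PDF herunterladen" ("Download PDF"). A sketch on synthetic HTML:

from bs4 import BeautifulSoup

html = '<p>Ihre Rechnung: <a href="https://example.com/r?U=...">PDF herunterladen</a></p>'
soup = BeautifulSoup(html, "html.parser")
link = soup.find("a", string="PDF herunterladen")
print(link.get("href") if link else "no matching link")
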
@@ -109,6 +171,8 @@ def read_emails():
         else:
             for message in messages:
                 process_email(service, message)
+    except Exception as e:
+        logging.error(f"Failed to initialize Gmail service: {e}")
 
 if __name__ == "__main__":
     read_emails()