diff --git a/email-pdf-downloader-new.py b/email-pdf-downloader-new.py
deleted file mode 100644
index 641b2a4..0000000
--- a/email-pdf-downloader-new.py
+++ /dev/null
@@ -1,178 +0,0 @@
-import os
-import base64
-import requests
-import logging
-import hashlib
-import re
-from urllib.parse import parse_qs, urlparse
-from google.auth.transport.requests import Request
-from google.oauth2.credentials import Credentials
-from google_auth_oauthlib.flow import InstalledAppFlow
-from googleapiclient.discovery import build
-from bs4 import BeautifulSoup
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-
-SCOPES = ['https://www.googleapis.com/auth/gmail.readonly', 'https://www.googleapis.com/auth/gmail.modify']
-SEARCH_QUERY = 'Amazon Kindle Support'
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-
-def get_credentials():
-    creds = None
-    token_path = os.getenv("TOKEN_PATH", SCRIPT_DIR+'/token.json')
-    credentials_path = os.getenv("CREDENTIALS_PATH", SCRIPT_DIR+'/credentials.json')
-
-    if os.path.exists(token_path):
-        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
-    if not creds or not creds.valid:
-        try:
-            if creds and creds.expired and creds.refresh_token:
-                creds.refresh(Request())
-            else:
-                flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
-                creds = flow.run_local_server(port=0)
-            with open(token_path, 'w') as token:
-                token.write(creds.to_json())
-        except Exception as e:
-            logging.error(f"Failed to obtain credentials: {e}")
-            return None
-    return creds
-
-def fetch_unread_messages(service, max_results=10):
-    try:
-        results = service.users().messages().list(userId='me', labelIds=['INBOX'], q="is:unread", maxResults=max_results).execute()
-        return results.get('messages', [])
-    except Exception as error:
-        logging.error(f"Error fetching unread messages: {error}")
-        return []
-
-def calculate_file_hash(file_path):
-    """Calculate SHA-256 hash of a file."""
-    sha256_hash = hashlib.sha256()
-    try:
-        with open(file_path, "rb") as f:
-            for byte_block in iter(lambda: f.read(4096), b""):
-                sha256_hash.update(byte_block)
-    except Exception as e:
-        logging.error(f"Error reading file {file_path} for hashing: {e}")
-        return None
-    return sha256_hash.hexdigest()
-
-def calculate_content_hash(content):
-    """Calculate SHA-256 hash of content."""
-    sha256_hash = hashlib.sha256()
-    sha256_hash.update(content)
-    return sha256_hash.hexdigest()
-
-def get_existing_versions(file_path):
-    """Get all existing versions of a file (with suffixes) in the same directory."""
-    base, ext = os.path.splitext(file_path)
-    dir_path = os.path.dirname(file_path)
-    base_name = os.path.basename(base)
-
-    existing_files = []
-    for filename in os.listdir(dir_path):
-        if filename.startswith(base_name) and filename.endswith(ext):
-            # Match files with suffix pattern `-1`, `-2`, etc., or the base file itself
-            if re.match(rf"{re.escape(base_name)}(-\d+)?{re.escape(ext)}$", filename):
-                existing_files.append(os.path.join(dir_path, filename))
-    return existing_files
-
-def get_unique_file_path(file_path):
-    """Generate a unique file path by adding a suffix if the file already exists."""
-    base, ext = os.path.splitext(file_path)
-    counter = 1
-    new_file_path = f"{base}-{counter}{ext}"
-    while os.path.exists(new_file_path):
-        counter += 1
-        new_file_path = f"{base}-{counter}{ext}"
-    return new_file_path
-
-def download_pdf(pdf_link):
-    parsed_url = urlparse(pdf_link)
-    query_params = parse_qs(parsed_url.query)
-    actual_file_url = query_params.get('U', [None])[0]
-    if not actual_file_url:
-        logging.error("No valid file URL found in PDF link.")
-        return
-
-    file_name = os.path.basename(urlparse(actual_file_url).path) or "downloaded_file.pdf"
-    data_dir = os.path.join(SCRIPT_DIR, "data")
-    file_path = os.path.join(data_dir, file_name)
-
-    try:
-        response = requests.get(actual_file_url, timeout=10)
-        if response.status_code == 200:
-            new_content_hash = calculate_content_hash(response.content)
-
-            # Check all existing versions of the file
-            existing_files = get_existing_versions(file_path)
-            for existing_file in existing_files:
-                existing_file_hash = calculate_file_hash(existing_file)
-                if existing_file_hash == new_content_hash:
-                    logging.info(f"An identical file already exists as {existing_file}. Skipping download.")
-                    return
-
-            # No identical file found, save as a new version
-            os.makedirs(data_dir, exist_ok=True)
-            if os.path.exists(file_path):
-                # If base file exists, find a unique file name with suffix
-                file_path = get_unique_file_path(file_path)
-            with open(file_path, "wb") as file:
-                file.write(response.content)
-            logging.info(f"File downloaded and saved to {file_path}")
-        else:
-            logging.error(f"Failed to download the file. Status code: {response.status_code}")
-    except requests.exceptions.Timeout:
-        logging.error("Request timed out while downloading PDF.")
-    except requests.exceptions.RequestException as e:
-        logging.error(f"An error occurred during file download: {e}")
-
-def process_email(service, message):
-    try:
-        msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
-        headers = msg['payload']['headers']
-        if any(SEARCH_QUERY in header.get('value', '') for header in headers if header['name'] == 'From'):
-            mail_body = ''
-            for part in msg.get('payload', {}).get('parts', []):
-                try:
-                    data = part['body']["data"]
-                    byte_code = base64.urlsafe_b64decode(data)
-                    mail_body += byte_code.decode("utf-8")
-                except Exception:
-                    continue
-
-            soup = BeautifulSoup(mail_body, "html.parser")
-            link = soup.find("a", string="PDF herunterladen")
-            if link:
-                pdf_link = link.get("href")
-                download_pdf(pdf_link)
-
-                service.users().messages().delete(userId='me', id=message['id']).execute()
-                logging.info(f"Email with ID {message['id']} successfully deleted after PDF download.")
-            else:
-                logging.info("No 'PDF herunterladen' link found in this email.")
-    except Exception as error:
-        logging.error(f"An error occurred while processing email ID {message['id']}: {error}")
-
-def read_emails():
-    creds = get_credentials()
-    if not creds:
-        logging.error("No valid credentials found.")
-        return
-
-    try:
-        service = build('gmail', 'v1', credentials=creds)
-        messages = fetch_unread_messages(service)
-
-        if not messages:
-            logging.info("No new messages.")
-        else:
-            for message in messages:
-                process_email(service, message)
-    except Exception as e:
-        logging.error(f"Failed to initialize Gmail service: {e}")
-
-if __name__ == "__main__":
-    read_emails()
\ No newline at end of file
diff --git a/email_pdf_downloader.py b/email_pdf_downloader.py
index 0512f27..641b2a4 100644
--- a/email_pdf_downloader.py
+++ b/email_pdf_downloader.py
@@ -2,6 +2,8 @@ import os
 import base64
 import requests
 import logging
+import hashlib
+import re
 from urllib.parse import parse_qs, urlparse
 from google.auth.transport.requests import Request
 from google.oauth2.credentials import Credentials
@@ -18,16 +20,23 @@ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 
 def get_credentials():
     creds = None
-    if os.path.exists(SCRIPT_DIR+'/token.json'):
-        creds = Credentials.from_authorized_user_file(SCRIPT_DIR+'/token.json', SCOPES)
+    token_path = os.getenv("TOKEN_PATH", SCRIPT_DIR+'/token.json')
+    credentials_path = os.getenv("CREDENTIALS_PATH", SCRIPT_DIR+'/credentials.json')
+
+    if os.path.exists(token_path):
+        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
     if not creds or not creds.valid:
-        if creds and creds.expired and creds.refresh_token:
-            creds.refresh(Request())
-        else:
-            flow = InstalledAppFlow.from_client_secrets_file(SCRIPT_DIR+'/credentials.json', SCOPES)
-            creds = flow.run_local_server(port=0)
-        with open(SCRIPT_DIR+'/token.json', 'w') as token:
-            token.write(creds.to_json())
+        try:
+            if creds and creds.expired and creds.refresh_token:
+                creds.refresh(Request())
+            else:
+                flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
+                creds = flow.run_local_server(port=0)
+            with open(token_path, 'w') as token:
+                token.write(creds.to_json())
+        except Exception as e:
+            logging.error(f"Failed to obtain credentials: {e}")
+            return None
     return creds
 
 def fetch_unread_messages(service, max_results=10):
@@ -38,8 +47,49 @@ def fetch_unread_messages(service, max_results=10):
         logging.error(f"Error fetching unread messages: {error}")
         return []
 
+def calculate_file_hash(file_path):
+    """Calculate SHA-256 hash of a file."""
+    sha256_hash = hashlib.sha256()
+    try:
+        with open(file_path, "rb") as f:
+            for byte_block in iter(lambda: f.read(4096), b""):
+                sha256_hash.update(byte_block)
+    except Exception as e:
+        logging.error(f"Error reading file {file_path} for hashing: {e}")
+        return None
+    return sha256_hash.hexdigest()
+
+def calculate_content_hash(content):
+    """Calculate SHA-256 hash of content."""
+    sha256_hash = hashlib.sha256()
+    sha256_hash.update(content)
+    return sha256_hash.hexdigest()
+
+def get_existing_versions(file_path):
+    """Get all existing versions of a file (with suffixes) in the same directory."""
+    base, ext = os.path.splitext(file_path)
+    dir_path = os.path.dirname(file_path)
+    base_name = os.path.basename(base)
+
+    existing_files = []
+    for filename in os.listdir(dir_path):
+        if filename.startswith(base_name) and filename.endswith(ext):
+            # Match files with suffix pattern `-1`, `-2`, etc., or the base file itself
+            if re.match(rf"{re.escape(base_name)}(-\d+)?{re.escape(ext)}$", filename):
+                existing_files.append(os.path.join(dir_path, filename))
+    return existing_files
+
+def get_unique_file_path(file_path):
+    """Generate a unique file path by adding a suffix if the file already exists."""
+    base, ext = os.path.splitext(file_path)
+    counter = 1
+    new_file_path = f"{base}-{counter}{ext}"
+    while os.path.exists(new_file_path):
+        counter += 1
+        new_file_path = f"{base}-{counter}{ext}"
+    return new_file_path
+
 def download_pdf(pdf_link):
-    # Parse URL and get the actual file URL
     parsed_url = urlparse(pdf_link)
     query_params = parse_qs(parsed_url.query)
     actual_file_url = query_params.get('U', [None])[0]
@@ -47,27 +97,36 @@ def download_pdf(pdf_link):
         logging.error("No valid file URL found in PDF link.")
         return
 
-    # Extract the file name
     file_name = os.path.basename(urlparse(actual_file_url).path) or "downloaded_file.pdf"
    data_dir = os.path.join(SCRIPT_DIR, "data")
     file_path = os.path.join(data_dir, file_name)
 
-    # Check if file exists
-    if os.path.exists(file_path):
-        logging.info(f"{file_name} already exists. Skipping download.")
-        return
-
-    # Download and save the file
     try:
-        response = requests.get(actual_file_url)
+        response = requests.get(actual_file_url, timeout=10)
         if response.status_code == 200:
+            new_content_hash = calculate_content_hash(response.content)
+
+            # Check all existing versions of the file
+            existing_files = get_existing_versions(file_path)
+            for existing_file in existing_files:
+                existing_file_hash = calculate_file_hash(existing_file)
+                if existing_file_hash == new_content_hash:
+                    logging.info(f"An identical file already exists as {existing_file}. Skipping download.")
+                    return
+
+            # No identical file found, save as a new version
             os.makedirs(data_dir, exist_ok=True)
+            if os.path.exists(file_path):
+                # If base file exists, find a unique file name with suffix
+                file_path = get_unique_file_path(file_path)
             with open(file_path, "wb") as file:
                 file.write(response.content)
             logging.info(f"File downloaded and saved to {file_path}")
         else:
             logging.error(f"Failed to download the file. Status code: {response.status_code}")
-    except Exception as e:
+    except requests.exceptions.Timeout:
+        logging.error("Request timed out while downloading PDF.")
+    except requests.exceptions.RequestException as e:
         logging.error(f"An error occurred during file download: {e}")
 
 def process_email(service, message):
@@ -84,31 +143,36 @@ def process_email(service, message):
                 except Exception:
                     continue
 
-            # Parse HTML and find the PDF link
             soup = BeautifulSoup(mail_body, "html.parser")
             link = soup.find("a", string="PDF herunterladen")
             if link:
                 pdf_link = link.get("href")
                 download_pdf(pdf_link)
 
-                # After successful download, delete the email
                 service.users().messages().delete(userId='me', id=message['id']).execute()
                 logging.info(f"Email with ID {message['id']} successfully deleted after PDF download.")
             else:
                 logging.info("No 'PDF herunterladen' link found in this email.")
     except Exception as error:
-        logging.error(f"An error occurred while processing email: {error}")
+        logging.error(f"An error occurred while processing email ID {message['id']}: {error}")
 
 def read_emails():
     creds = get_credentials()
-    service = build('gmail', 'v1', credentials=creds)
-    messages = fetch_unread_messages(service)
+    if not creds:
+        logging.error("No valid credentials found.")
+        return
 
-    if not messages:
-        logging.info("No new messages.")
-    else:
-        for message in messages:
-            process_email(service, message)
+    try:
+        service = build('gmail', 'v1', credentials=creds)
+        messages = fetch_unread_messages(service)
+
+        if not messages:
+            logging.info("No new messages.")
+        else:
+            for message in messages:
+                process_email(service, message)
+    except Exception as e:
+        logging.error(f"Failed to initialize Gmail service: {e}")
 
 if __name__ == "__main__":
-    read_emails()
+    read_emails()
\ No newline at end of file