"""Download PDFs linked in Amazon Kindle Support emails from Gmail.

Scans unread inbox messages from the support sender, follows the
"PDF herunterladen" ("Download PDF") link in matching emails, deduplicates
downloads by content hash, and moves each processed email to the trash.
"""

import base64
import hashlib
import logging
import os
import re
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# Configure logging
logging.basicConfig(level=logging.INFO)

SCOPES = [
    'https://www.googleapis.com/auth/gmail.readonly',
    'https://www.googleapis.com/auth/gmail.modify',
]
SEARCH_QUERY = 'Amazon Kindle Support'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))


def get_credentials():
    """Load cached OAuth credentials, refreshing or re-running the flow if needed."""
    creds = None
    token_path = os.getenv("TOKEN_PATH", os.path.join(SCRIPT_DIR, 'token.json'))
    credentials_path = os.getenv("CREDENTIALS_PATH", os.path.join(SCRIPT_DIR, 'credentials.json'))
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
    if not creds or not creds.valid:
        try:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
                creds = flow.run_local_server(port=0)
            with open(token_path, 'w') as token:
                token.write(creds.to_json())
        except Exception as e:
            logging.error(f"Failed to obtain credentials: {e}")
            return None
    return creds


def fetch_unread_messages(service, max_results=10):
    """Return up to max_results unread messages from the inbox."""
    try:
        results = service.users().messages().list(
            userId='me', labelIds=['INBOX'], q="is:unread", maxResults=max_results
        ).execute()
        return results.get('messages', [])
    except Exception as error:
        logging.error(f"Error fetching unread messages: {error}")
        return []


def calculate_file_hash(file_path):
    """Calculate the SHA-256 hash of a file."""
    sha256_hash = hashlib.sha256()
    try:
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
    except Exception as e:
        logging.error(f"Error reading file {file_path} for hashing: {e}")
        return None
    return sha256_hash.hexdigest()


def calculate_content_hash(content):
    """Calculate the SHA-256 hash of a bytes object."""
    sha256_hash = hashlib.sha256()
    sha256_hash.update(content)
    return sha256_hash.hexdigest()


def get_existing_versions(file_path):
    """Get all existing versions of a file (the base name plus -1, -2, ... suffixes)."""
    base, ext = os.path.splitext(file_path)
    dir_path = os.path.dirname(file_path)
    base_name = os.path.basename(base)
    if not os.path.isdir(dir_path):
        # The data directory is only created on the first download; nothing to compare yet.
        return []
    existing_files = []
    for filename in os.listdir(dir_path):
        if filename.startswith(base_name) and filename.endswith(ext):
            # Match the base file itself or files with a `-1`, `-2`, etc. suffix
            if re.match(rf"{re.escape(base_name)}(-\d+)?{re.escape(ext)}$", filename):
                existing_files.append(os.path.join(dir_path, filename))
    return existing_files


def get_unique_file_path(file_path):
    """Generate a unique file path by adding a numeric suffix if the file already exists."""
    base, ext = os.path.splitext(file_path)
    counter = 1
    new_file_path = f"{base}-{counter}{ext}"
    while os.path.exists(new_file_path):
        counter += 1
        new_file_path = f"{base}-{counter}{ext}"
    return new_file_path


def download_pdf(pdf_link):
    """Download the PDF behind a tracking link, skipping content that already exists locally."""
    # The email link is a redirect wrapper; the real file URL sits in the 'U' query parameter.
    parsed_url = urlparse(pdf_link)
    query_params = parse_qs(parsed_url.query)
    actual_file_url = query_params.get('U', [None])[0]
    if not actual_file_url:
        logging.error("No valid file URL found in PDF link.")
        return

    file_name = os.path.basename(urlparse(actual_file_url).path) or "downloaded_file.pdf"
    # file_name = file_name.replace('%20', '-')  # Replace '%20' with dashes
    data_dir = os.path.join(SCRIPT_DIR, "data")
    file_path = os.path.join(data_dir, file_name)

    try:
        response = requests.get(actual_file_url, timeout=10)
        if response.status_code == 200:
            new_content_hash = calculate_content_hash(response.content)
            # Compare against all existing versions of the file
            existing_files = get_existing_versions(file_path)
            for existing_file in existing_files:
                existing_file_hash = calculate_file_hash(existing_file)
                if existing_file_hash == new_content_hash:
                    logging.info(f"An identical file already exists as {existing_file}. Skipping download.")
                    return
            # No identical file found, save as a new version
            os.makedirs(data_dir, exist_ok=True)
            if os.path.exists(file_path):
                # If the base file exists, find a unique file name with a suffix
                file_path = get_unique_file_path(file_path)
            with open(file_path, "wb") as file:
                file.write(response.content)
            logging.info(f"File downloaded and saved to {file_path}")
        else:
            logging.error(f"Failed to download the file. Status code: {response.status_code}")
    except requests.exceptions.Timeout:
        logging.error("Request timed out while downloading PDF.")
    except requests.exceptions.RequestException as e:
        logging.error(f"An error occurred during file download: {e}")


def process_email(service, message):
    """Download the linked PDF from a matching email and move the email to the trash."""
    try:
        msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
        headers = msg['payload']['headers']
        if any(SEARCH_QUERY in header.get('value', '') for header in headers if header['name'] == 'From'):
            # Concatenate every decodable body part into one HTML string
            mail_body = ''
            for part in msg.get('payload', {}).get('parts', []):
                try:
                    data = part['body']["data"]
                    byte_code = base64.urlsafe_b64decode(data)
                    mail_body += byte_code.decode("utf-8")
                except Exception:
                    continue
            soup = BeautifulSoup(mail_body, "html.parser")
            link = soup.find("a", string="PDF herunterladen")  # the "Download PDF" link in the German email
            if link:
                pdf_link = link.get("href")
                download_pdf(pdf_link)
                # messages.delete() requires the full https://mail.google.com/ scope;
                # trash() works with gmail.modify and is reversible.
                service.users().messages().trash(userId='me', id=message['id']).execute()
                logging.info(f"Email with ID {message['id']} moved to trash after PDF download.")
            else:
                logging.info("No 'PDF herunterladen' link found in this email.")
    except Exception as error:
        logging.error(f"An error occurred while processing email ID {message['id']}: {error}")


def read_emails():
    """Authenticate, fetch unread messages, and process each one."""
    creds = get_credentials()
    if not creds:
        logging.error("No valid credentials found.")
        return
    try:
        service = build('gmail', 'v1', credentials=creds)
        messages = fetch_unread_messages(service)
        if not messages:
            logging.info("No new messages.")
        else:
            for message in messages:
                process_email(service, message)
    except Exception as e:
        logging.error(f"Failed to initialize Gmail service: {e}")


if __name__ == "__main__":
    read_emails()
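
# Example invocation (a sketch; the script filename and paths below are illustrative).
# credentials.json must be an OAuth client secret for a desktop/installed app created in
# the Google Cloud console; token.json is written automatically after the first login:
#
#   TOKEN_PATH=/path/to/token.json CREDENTIALS_PATH=/path/to/credentials.json \
#       python download_kindle_pdfs.py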