amznMailConverter/email_pdf_downloader.py

#!/scrapy/venvs/amznMailConverter/bin/python
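"""Download PDFs linked in Amazon Kindle Support emails.

Checks the Gmail inbox for unread messages, looks for mails from
Amazon Kindle Support containing a "PDF herunterladen" ("Download PDF")
link, and saves the linked PDF into a local data/ directory.
"""
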
import base64
import logging
import os
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# Configure logging
logging.basicConfig(level=logging.INFO)

SCOPES = ['https://www.googleapis.com/auth/gmail.readonly', 'https://www.googleapis.com/auth/gmail.modify']
SEARCH_QUERY = 'Amazon Kindle Support'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))


def get_credentials():
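    """Load cached Gmail OAuth credentials, refreshing or re-authorizing as needed."""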
    creds = None
    token_path = os.path.join(SCRIPT_DIR, 'token.json')
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            # Silently refresh an expired token.
            creds.refresh(Request())
        else:
            # No usable token: run the interactive OAuth flow once.
            flow = InstalledAppFlow.from_client_secrets_file(
                os.path.join(SCRIPT_DIR, 'credentials.json'), SCOPES)
            creds = flow.run_local_server(port=0)
        # Cache the new or refreshed token for the next run.
        with open(token_path, 'w') as token:
            token.write(creds.to_json())
    return creds


def fetch_unread_messages(service, max_results=10):
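    """Return up to max_results unread inbox messages (empty list on error)."""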
    try:
        results = service.users().messages().list(
            userId='me', labelIds=['INBOX'], q='is:unread', maxResults=max_results).execute()
        return results.get('messages', [])
    except Exception as error:
        logging.error(f"Error fetching unread messages: {error}")
        return []


def download_pdf(pdf_link):
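    """Extract the real file URL from the wrapped link and save the PDF under data/."""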
    # The mail link is a redirect; the real file URL sits in its 'U' query parameter.
    parsed_url = urlparse(pdf_link)
    query_params = parse_qs(parsed_url.query)
    actual_file_url = query_params.get('U', [None])[0]
    if not actual_file_url:
        logging.error("No valid file URL found in PDF link.")
        return
    # Derive the file name, falling back to a generic name for bare paths.
    file_name = os.path.basename(urlparse(actual_file_url).path) or "downloaded_file.pdf"
    data_dir = os.path.join(SCRIPT_DIR, "data")
    file_path = os.path.join(data_dir, file_name)
    # Skip files that were already downloaded on an earlier run.
    if os.path.exists(file_path):
        logging.info(f"{file_name} already exists. Skipping download.")
        return
    # Download and save the file.
    try:
        response = requests.get(actual_file_url, timeout=60)
        if response.status_code == 200:
            os.makedirs(data_dir, exist_ok=True)
            with open(file_path, "wb") as file:
                file.write(response.content)
            logging.info(f"File downloaded and saved to {file_path}")
        else:
            logging.error(f"Failed to download the file. Status code: {response.status_code}")
    except Exception as e:
        logging.error(f"An error occurred during file download: {e}")


def process_email(service, message):
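    """Fetch one message and, if it is from Amazon Kindle Support, download its linked PDF."""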
    try:
        msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
        headers = msg['payload']['headers']
        # Only handle mails whose From header matches the expected sender.
        if any(SEARCH_QUERY in header.get('value', '') for header in headers if header['name'] == 'From'):
            # Concatenate all decodable MIME parts into one body string.
            mail_body = ''
            for part in msg.get('payload', {}).get('parts', []):
                try:
                    data = part['body']['data']
                    byte_code = base64.urlsafe_b64decode(data)
                    mail_body += byte_code.decode("utf-8")
                except Exception:
                    # Skip parts without inline data (e.g. attachments) or non-UTF-8 parts.
                    continue
            # Parse the HTML and find the download link ("PDF herunterladen" is German for "Download PDF").
            soup = BeautifulSoup(mail_body, "html.parser")
            link = soup.find("a", string="PDF herunterladen")
            if link:
                pdf_link = link.get("href")
                download_pdf(pdf_link)
            else:
                logging.info("No 'PDF herunterladen' link found in this email.")
    except Exception as error:
        logging.error(f"An error occurred while processing email: {error}")


def read_emails():
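    """Entry point: authenticate, fetch unread messages, and process each one."""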
    creds = get_credentials()
    service = build('gmail', 'v1', credentials=creds)
    messages = fetch_unread_messages(service)
    if not messages:
        logging.info("No new messages.")
    else:
        for message in messages:
            process_email(service, message)


if __name__ == "__main__":
    read_emails()