added file appendix -1

maru21 2024-11-21 10:26:13 +01:00
parent 9a769907b4
commit 63e6f2aea9
2 changed files with 94 additions and 208 deletions

View File

@@ -1,178 +0,0 @@
import os
import base64
import requests
import logging
import hashlib
import re
from urllib.parse import parse_qs, urlparse
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from bs4 import BeautifulSoup
# Configure logging
logging.basicConfig(level=logging.INFO)
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly', 'https://www.googleapis.com/auth/gmail.modify']
SEARCH_QUERY = 'Amazon Kindle Support'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
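
# OAuth flow: reuse the cached token.json if present, refresh an expired
# token, or run the interactive browser flow once; the token and client
# secret paths can be overridden via the TOKEN_PATH and CREDENTIALS_PATH
# environment variables.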
def get_credentials():
    creds = None
    token_path = os.getenv("TOKEN_PATH", SCRIPT_DIR+'/token.json')
    credentials_path = os.getenv("CREDENTIALS_PATH", SCRIPT_DIR+'/credentials.json')
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
    if not creds or not creds.valid:
        try:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
                creds = flow.run_local_server(port=0)
            with open(token_path, 'w') as token:
                token.write(creds.to_json())
        except Exception as e:
            logging.error(f"Failed to obtain credentials: {e}")
            return None
    return creds
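
# List up to max_results unread messages in the INBOX via the Gmail API.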
def fetch_unread_messages(service, max_results=10):
    try:
        results = service.users().messages().list(userId='me', labelIds=['INBOX'], q="is:unread", maxResults=max_results).execute()
        return results.get('messages', [])
    except Exception as error:
        logging.error(f"Error fetching unread messages: {error}")
        return []

def calculate_file_hash(file_path):
    """Calculate SHA-256 hash of a file."""
    sha256_hash = hashlib.sha256()
    try:
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
    except Exception as e:
        logging.error(f"Error reading file {file_path} for hashing: {e}")
        return None
    return sha256_hash.hexdigest()

def calculate_content_hash(content):
    """Calculate SHA-256 hash of content."""
    sha256_hash = hashlib.sha256()
    sha256_hash.update(content)
    return sha256_hash.hexdigest()

def get_existing_versions(file_path):
    """Get all existing versions of a file (with suffixes) in the same directory."""
    base, ext = os.path.splitext(file_path)
    dir_path = os.path.dirname(file_path)
    base_name = os.path.basename(base)
    existing_files = []
    # Guard against the data directory not existing yet (first run);
    # os.listdir would otherwise raise FileNotFoundError.
    if not os.path.isdir(dir_path):
        return existing_files
    for filename in os.listdir(dir_path):
        if filename.startswith(base_name) and filename.endswith(ext):
            # Match files with suffix pattern `-1`, `-2`, etc., or the base file itself
            if re.match(rf"{re.escape(base_name)}(-\d+)?{re.escape(ext)}$", filename):
                existing_files.append(os.path.join(dir_path, filename))
    return existing_files

def get_unique_file_path(file_path):
    """Generate a unique file path by adding a suffix if the file already exists."""
    base, ext = os.path.splitext(file_path)
    counter = 1
    new_file_path = f"{base}-{counter}{ext}"
    while os.path.exists(new_file_path):
        counter += 1
        new_file_path = f"{base}-{counter}{ext}"
    return new_file_path
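
# The email's link target is treated as a wrapper URL whose 'U' query
# parameter holds the actual file URL. Downloads are deduplicated by
# comparing the SHA-256 of the response body against every existing
# version (file.pdf, file-1.pdf, ...) before saving a new suffixed copy.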
def download_pdf(pdf_link):
    parsed_url = urlparse(pdf_link)
    query_params = parse_qs(parsed_url.query)
    actual_file_url = query_params.get('U', [None])[0]
    if not actual_file_url:
        logging.error("No valid file URL found in PDF link.")
        return
    file_name = os.path.basename(urlparse(actual_file_url).path) or "downloaded_file.pdf"
    data_dir = os.path.join(SCRIPT_DIR, "data")
    file_path = os.path.join(data_dir, file_name)
    try:
        response = requests.get(actual_file_url, timeout=10)
        if response.status_code == 200:
            new_content_hash = calculate_content_hash(response.content)
            # Check all existing versions of the file
            existing_files = get_existing_versions(file_path)
            for existing_file in existing_files:
                existing_file_hash = calculate_file_hash(existing_file)
                if existing_file_hash == new_content_hash:
                    logging.info(f"An identical file already exists as {existing_file}. Skipping download.")
                    return
            # No identical file found, save as a new version
            os.makedirs(data_dir, exist_ok=True)
            if os.path.exists(file_path):
                # If base file exists, find a unique file name with suffix
                file_path = get_unique_file_path(file_path)
            with open(file_path, "wb") as file:
                file.write(response.content)
            logging.info(f"File downloaded and saved to {file_path}")
        else:
            logging.error(f"Failed to download the file. Status code: {response.status_code}")
    except requests.exceptions.Timeout:
        logging.error("Request timed out while downloading PDF.")
    except requests.exceptions.RequestException as e:
        logging.error(f"An error occurred during file download: {e}")
def process_email(service, message):
    try:
        msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
        headers = msg['payload']['headers']
        if any(SEARCH_QUERY in header.get('value', '') for header in headers if header['name'] == 'From'):
            mail_body = ''
            for part in msg.get('payload', {}).get('parts', []):
                try:
                    data = part['body']["data"]
                    byte_code = base64.urlsafe_b64decode(data)
                    mail_body += byte_code.decode("utf-8")
                except Exception:
                    continue
            soup = BeautifulSoup(mail_body, "html.parser")
            link = soup.find("a", string="PDF herunterladen")
            if link:
                pdf_link = link.get("href")
                download_pdf(pdf_link)
                service.users().messages().delete(userId='me', id=message['id']).execute()
                logging.info(f"Email with ID {message['id']} successfully deleted after PDF download.")
            else:
                logging.info("No 'PDF herunterladen' link found in this email.")
    except Exception as error:
        logging.error(f"An error occurred while processing email ID {message['id']}: {error}")
def read_emails():
    creds = get_credentials()
    if not creds:
        logging.error("No valid credentials found.")
        return
    try:
        service = build('gmail', 'v1', credentials=creds)
        messages = fetch_unread_messages(service)
        if not messages:
            logging.info("No new messages.")
        else:
            for message in messages:
                process_email(service, message)
    except Exception as e:
        logging.error(f"Failed to initialize Gmail service: {e}")

if __name__ == "__main__":
    read_emails()

View File

@@ -2,6 +2,8 @@ import os
 import base64
 import requests
 import logging
+import hashlib
+import re
 from urllib.parse import parse_qs, urlparse
 from google.auth.transport.requests import Request
 from google.oauth2.credentials import Credentials
@@ -18,16 +20,23 @@ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 def get_credentials():
     creds = None
-    if os.path.exists(SCRIPT_DIR+'/token.json'):
-        creds = Credentials.from_authorized_user_file(SCRIPT_DIR+'/token.json', SCOPES)
+    token_path = os.getenv("TOKEN_PATH", SCRIPT_DIR+'/token.json')
+    credentials_path = os.getenv("CREDENTIALS_PATH", SCRIPT_DIR+'/credentials.json')
+    if os.path.exists(token_path):
+        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
     if not creds or not creds.valid:
-        if creds and creds.expired and creds.refresh_token:
-            creds.refresh(Request())
-        else:
-            flow = InstalledAppFlow.from_client_secrets_file(SCRIPT_DIR+'/credentials.json', SCOPES)
-            creds = flow.run_local_server(port=0)
-        with open(SCRIPT_DIR+'/token.json', 'w') as token:
-            token.write(creds.to_json())
+        try:
+            if creds and creds.expired and creds.refresh_token:
+                creds.refresh(Request())
+            else:
+                flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
+                creds = flow.run_local_server(port=0)
+            with open(token_path, 'w') as token:
+                token.write(creds.to_json())
+        except Exception as e:
+            logging.error(f"Failed to obtain credentials: {e}")
+            return None
     return creds

 def fetch_unread_messages(service, max_results=10):
@@ -38,8 +47,49 @@ def fetch_unread_messages(service, max_results=10):
         logging.error(f"Error fetching unread messages: {error}")
         return []

+def calculate_file_hash(file_path):
+    """Calculate SHA-256 hash of a file."""
+    sha256_hash = hashlib.sha256()
+    try:
+        with open(file_path, "rb") as f:
+            for byte_block in iter(lambda: f.read(4096), b""):
+                sha256_hash.update(byte_block)
+    except Exception as e:
+        logging.error(f"Error reading file {file_path} for hashing: {e}")
+        return None
+    return sha256_hash.hexdigest()
+
+def calculate_content_hash(content):
+    """Calculate SHA-256 hash of content."""
+    sha256_hash = hashlib.sha256()
+    sha256_hash.update(content)
+    return sha256_hash.hexdigest()
+
+def get_existing_versions(file_path):
+    """Get all existing versions of a file (with suffixes) in the same directory."""
+    base, ext = os.path.splitext(file_path)
+    dir_path = os.path.dirname(file_path)
+    base_name = os.path.basename(base)
+    existing_files = []
+    # Guard against the data directory not existing yet (first run);
+    # os.listdir would otherwise raise FileNotFoundError.
+    if not os.path.isdir(dir_path):
+        return existing_files
+    for filename in os.listdir(dir_path):
+        if filename.startswith(base_name) and filename.endswith(ext):
+            # Match files with suffix pattern `-1`, `-2`, etc., or the base file itself
+            if re.match(rf"{re.escape(base_name)}(-\d+)?{re.escape(ext)}$", filename):
+                existing_files.append(os.path.join(dir_path, filename))
+    return existing_files
+
+def get_unique_file_path(file_path):
+    """Generate a unique file path by adding a suffix if the file already exists."""
+    base, ext = os.path.splitext(file_path)
+    counter = 1
+    new_file_path = f"{base}-{counter}{ext}"
+    while os.path.exists(new_file_path):
+        counter += 1
+        new_file_path = f"{base}-{counter}{ext}"
+    return new_file_path
+
 def download_pdf(pdf_link):
-    # Parse URL and get the actual file URL
     parsed_url = urlparse(pdf_link)
     query_params = parse_qs(parsed_url.query)
     actual_file_url = query_params.get('U', [None])[0]
@@ -47,27 +97,36 @@ def download_pdf(pdf_link):
         logging.error("No valid file URL found in PDF link.")
         return

-    # Extract the file name
     file_name = os.path.basename(urlparse(actual_file_url).path) or "downloaded_file.pdf"
     data_dir = os.path.join(SCRIPT_DIR, "data")
     file_path = os.path.join(data_dir, file_name)

-    # Check if file exists
-    if os.path.exists(file_path):
-        logging.info(f"{file_name} already exists. Skipping download.")
-        return
-
-    # Download and save the file
     try:
-        response = requests.get(actual_file_url)
+        response = requests.get(actual_file_url, timeout=10)
         if response.status_code == 200:
+            new_content_hash = calculate_content_hash(response.content)
+
+            # Check all existing versions of the file
+            existing_files = get_existing_versions(file_path)
+            for existing_file in existing_files:
+                existing_file_hash = calculate_file_hash(existing_file)
+                if existing_file_hash == new_content_hash:
+                    logging.info(f"An identical file already exists as {existing_file}. Skipping download.")
+                    return
+
+            # No identical file found, save as a new version
             os.makedirs(data_dir, exist_ok=True)
+            if os.path.exists(file_path):
+                # If base file exists, find a unique file name with suffix
+                file_path = get_unique_file_path(file_path)
             with open(file_path, "wb") as file:
                 file.write(response.content)
             logging.info(f"File downloaded and saved to {file_path}")
         else:
             logging.error(f"Failed to download the file. Status code: {response.status_code}")
-    except Exception as e:
+    except requests.exceptions.Timeout:
+        logging.error("Request timed out while downloading PDF.")
+    except requests.exceptions.RequestException as e:
         logging.error(f"An error occurred during file download: {e}")
@@ -84,31 +143,36 @@ def process_email(service, message):
                 except Exception:
                     continue

-        # Parse HTML and find the PDF link
             soup = BeautifulSoup(mail_body, "html.parser")
             link = soup.find("a", string="PDF herunterladen")
             if link:
                 pdf_link = link.get("href")
                 download_pdf(pdf_link)
-                # After successful download, delete the email
                 service.users().messages().delete(userId='me', id=message['id']).execute()
                 logging.info(f"Email with ID {message['id']} successfully deleted after PDF download.")
             else:
                 logging.info("No 'PDF herunterladen' link found in this email.")
     except Exception as error:
-        logging.error(f"An error occurred while processing email: {error}")
+        logging.error(f"An error occurred while processing email ID {message['id']}: {error}")

 def read_emails():
     creds = get_credentials()
-    service = build('gmail', 'v1', credentials=creds)
-    messages = fetch_unread_messages(service)
+    if not creds:
+        logging.error("No valid credentials found.")
+        return
+    try:
+        service = build('gmail', 'v1', credentials=creds)
+        messages = fetch_unread_messages(service)

-    if not messages:
-        logging.info("No new messages.")
-    else:
-        for message in messages:
-            process_email(service, message)
+        if not messages:
+            logging.info("No new messages.")
+        else:
+            for message in messages:
+                process_email(service, message)
+    except Exception as e:
+        logging.error(f"Failed to initialize Gmail service: {e}")

 if __name__ == "__main__":
     read_emails()