#!/usr/bin/env python3
'''
Job Site Crawler
version 1.1
Sites included: Stadt Zurich / Sozialinfo

Log:
01.06.2023 - sozialinfo changed its results page to a vue.js implementation; rebuilt the parser - much simpler now
30.05.2023 - added sort by prio
20.05.2023 - finalized INFO_OBJECT
11.05.2023 - added INFO_OBJECT compatibility to the web frontend as modalNotifier
07.05.2023 - added INFO-OBJECT for error messages and captcha check on sozialinfo

by KM maru21
'''

import asyncio
import json
import os
from pathlib import Path

import parseStzh
import parseSozialinfo2
from push import push
from db import db

ntfyLnk = 'https://push.kmuhrer.net'
ntfyTopic = 'sozialinfo'
dataPath = os.path.join(Path(os.path.dirname(__file__)).parent, 'data')
dbFile = os.path.join(dataPath, 'articles.db')
filterFile = os.path.join(dataPath, 'article_filter.conf')


def jsonDump(data, file):
    # Write data as pretty-printed JSON; the context manager closes the file.
    with open(file, 'w') as f:
        json.dump(data, f, indent=4)


def prioSort(articles):
    # Bucket articles by priority and send 'pinned' last ('important' second
    # to last) so the highest-priority pushes end up on top of the feed.
    pinned = []
    important = []
    others = []
    for article in articles:
        if article['prio'] == 'pinned':
            pinned.append(article)
        elif article['prio'] == 'important':
            important.append(article)
        else:
            others.append(article)
    return others + important + pinned


def compareArticles(new, old):
    # Return every article whose id (the dict key) was not seen last run.
    newArticles = []
    for key in new:
        if key not in old:
            newArticles.append(new[key])
    return newArticles


def readDBTable(table):
    dbCon = db(dbFile)
    try:
        jd = json.dumps(dbCon.readAll(table))
    except Exception:
        return 'none'
    return jd


def listDBTable():
    dbCon = db(dbFile)
    try:
        jd = json.dumps(dbCon.readTables())
    except Exception:
        return 'none'
    return jd


async def run():
    dbCon = db(dbFile)
    ntfy = push(ntfyLnk, ntfyTopic)

    # Scrape the current articles from the job sites.
    currentArticles = json.loads(await main())

    # Read the article ids stored by the previous run (column 1 holds the id).
    dbNewArticles = []
    for line in dbCon.readAll('new_articles'):
        dbNewArticles.append(line[1])

    # Skip the table update if the scrape reported an error via INFO-OBJECT.
    errorCheck = True
    if 'INFO-OBJECT' in currentArticles:
        if currentArticles['INFO-OBJECT']['Titel'] == 'ERROR':
            errorCheck = False

    if errorCheck:
        # First run: the table is empty, so populate it and stop - there is
        # nothing to compare against yet.
        if not dbNewArticles:
            for index in currentArticles:
                if index == 'INFO-OBJECT':
                    continue
                dbCon.writeNew(currentArticles[index])
            return
        # Otherwise replace the new_articles table with the current snapshot.
        if currentArticles:
            dbCon.delete_table('new_articles')
            for index in currentArticles:
                if index == 'INFO-OBJECT':
                    continue
                dbCon.writeNew(currentArticles[index])

    # Compare the current scrape against the previous run.
    newArticles = compareArticles(currentArticles, dbNewArticles)

    # Push new articles sorted by priority and archive them permanently;
    # INFO-OBJECTs are pushed but never written to the archive.
    if newArticles:
        newArticles = prioSort(newArticles)
        for article in newArticles:
            ntfy.send(article)
            if article['id'] == 'INFO-OBJECT':
                continue
            dbCon.write(article)
    else:
        ntfy.send('none')


async def runAsync(func, ff: str):
    # Helper to run a blocking parser in a worker thread.
    return await asyncio.to_thread(func, ff)


async def main(mode='full'):
    # Run both site parsers concurrently and merge their article dicts.
    ff = filterFile
    parsers = [parseStzh.init(ff, mode), parseSozialinfo2.init(ff, mode)]
    articlesStzh, articlesSozialinfo = await asyncio.gather(*parsers)
    articles = {**articlesStzh, **articlesSozialinfo}
    return json.dumps(articles)


if __name__ == '__main__':
    asyncio.run(run())
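
# ---------------------------------------------------------------------------
# Illustrative sketch (assumptions, not executed code): the minimal article
# shape this script relies on. Only 'id' and 'prio' are read here; any other
# field is hypothetical - the real schema is whatever parseStzh /
# parseSozialinfo2 emit, keyed by article id, e.g.:
#
#   {
#       '42': {
#           'id': '42',           # matches the dict key of the scrape result
#           'prio': 'important',  # 'pinned' | 'important' | anything else
#       },
#   }
#
# prioSort() then orders a batch as others -> 'important' -> 'pinned', so the
# highest-priority articles are pushed last and land on top of the feed.
# ---------------------------------------------------------------------------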