#!/usr/bin/env python3
'''
Job Site Crawler version 1.1
Sites included: Stadt Zurich / Sozialinfo
Log:
10.07.2023 - fixed homegate and updated the flats page for highlighting
01.06.2023 - sozialinfo changed its results page to a vue.js implementation; rebuilt the parser - much easier now
30.05.2023 - added sort by prio
20.05.2023 - finalized INFO_OBJECT
11.05.2023 - added INFO_OBJECT compatibility to the web frontend as modalNotifier
07.05.2023 - added INFO-OBJECT for error msg and captcha check on sozialinfo
by KM maru21
'''
import asyncio
import json
import os
from pathlib import Path

import parseStzh
import parseSozialinfo2
from db import db
from push import push
ntfyLnk = 'https://push.kmuhrer.net'
ntfyTopic = 'sozialinfo'
dataPath = os.path.join(Path(os.path.dirname(__file__)).parent, 'data')
dbFile = os.path.join(dataPath, 'articles.db')
filterFile = os.path.join(dataPath, 'article_filter.conf')
def jsonDump(data, file):
    # write data as pretty-printed JSON; the context manager closes the file handle
    with open(file, 'w') as f:
        json.dump(data, f, indent=4)
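# Usage sketch (hypothetical path; jsonDump is not called in this file):
#   jsonDump({'id': 'a1', 'prio': 'normal'}, '/tmp/articles.json')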
def prioSort(articles):
    # bucket the articles by priority, then return them in the order
    # normal, important, pinned
    pinned = []
    important = []
    normal = []
    for article in articles:
        if article['prio'] == 'pinned':
            pinned.append(article)
        elif article['prio'] == 'important':
            important.append(article)
        else:
            normal.append(article)
    return normal + important + pinned
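# Ordering sketch (hypothetical prio values on minimal dicts):
#   prioSort([{'prio': 'pinned'}, {'prio': 'x'}, {'prio': 'important'}])
#   -> [{'prio': 'x'}, {'prio': 'important'}, {'prio': 'pinned'}]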
def compaireArticles(new, old):
    # new: dict of current articles keyed by id; old: list of ids from the previous run
    newArticles = []
    for key in new:
        if key not in old:
            # this id was not seen in the previous run
            newArticles.append(new[key])
    return newArticles
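# Comparison sketch (hypothetical ids): an article whose id is missing from
# the previous run's id list counts as new:
#   compaireArticles({'a1': {'id': 'a1'}, 'a0': {'id': 'a0'}}, ['a0'])
#   -> [{'id': 'a1'}]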
def readDBTable(table):
    dbCon = db(dbFile)
    try:
        jd = json.dumps(dbCon.readAll(table))
    except Exception:
        return 'none'
    return jd
def listDBTable():
    dbCon = db(dbFile)
    try:
        jd = json.dumps(dbCon.readTables())
    except Exception:
        return 'none'
    return jd
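# Both helpers return JSON strings (or the sentinel 'none' on failure);
# presumably they are the database read hooks for the web frontend
# mentioned in the changelog.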
async def run():
    dbCon = db(dbFile)
    ntfy = push(ntfyLnk, ntfyTopic)
    # scrape new data from the servers
    currentArticles = json.loads(await main())
    # read the data stored by the previous run
    dbNewArticles = []
    for line in dbCon.readAll('new_articles'):
        dbNewArticles.append(line[1])
    # check if an error occurred
    # check if the new-articles table exists; if not, seed it and exit
    # check if there are new articles and replace the new-articles table
    errorCheck = True
    if 'INFO-OBJECT' in currentArticles:
        if currentArticles['INFO-OBJECT']['Titel'] == 'ERROR':
            errorCheck = False
    if errorCheck:
        if len(dbNewArticles) <= 0:
            # first run: seed the table and stop without pushing everything
            for index in currentArticles:
                if index == 'INFO-OBJECT': continue
                article = currentArticles[index]
                dbCon.writeNew(article)
            return
        if len(currentArticles) > 0:
            dbCon.delete_table('new_articles')
            for index in currentArticles:
                if index == 'INFO-OBJECT': continue
                article = currentArticles[index]
                dbCon.writeNew(article)
    # compare db and current articles
    newArticles = compaireArticles(currentArticles, dbNewArticles)
    # if there are new articles, push them and write them permanently to the db
    if len(newArticles) > 0:
        # sort new articles by priority
        newArticles = prioSort(newArticles)
        for article in newArticles:
            ntfy.send(article)
            if article['id'] == 'INFO-OBJECT': continue
            dbCon.write(article)
    else:
        ntfy.send('none')
async def runAsync(func, ff: str):
    # helper that runs a blocking parser function in a worker thread
    return await asyncio.to_thread(func, ff)
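# Usage sketch (hypothetical; runAsync is not called in the current flow,
# since main() awaits the parser coroutines directly):
#   articles = await runAsync(someBlockingParser, filterFile)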
async def main(mode='full'):
    ff = filterFile
    # run both site parsers concurrently and merge their article dicts
    parsers = [parseStzh.init(ff, mode), parseSozialinfo2.init(ff, mode)]
    articlesStzh, articlesSozialinfo = await asyncio.gather(*parsers)
    articles = {**articlesStzh, **articlesSozialinfo}
    return json.dumps(articles)
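# Shape sketch of the returned JSON string (hypothetical keys, inferred from run()):
#   '{"<article id>": {"id": "<article id>", "prio": "normal", ...},
#     "INFO-OBJECT": {"Titel": "...", ...}}'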
if __name__ == '__main__':
    asyncio.run(run())