168 lines
3.9 KiB
Python
168 lines
3.9 KiB
Python
#!/usr/bin/env python3
|
|
|
|
'''
|
|
Job Site Crawler version 1.1
|
|
Sites included: Stadt Zurich/ Sozialinfo
|
|
|
|
Log:
|
|
10.07.2023 - fixed homegate and updated flats page for highlighting
|
|
01.06.2023 - sozialinfo changed its results page to a vue.js implementation; rebuilt the parser - much easier now
|
|
30.05.2023 - added sort by prio
|
|
20.05.2023 - finalized INFO_OBJECT
|
|
11.05.2023 - added INFO_OBJECT compatibility to web frontend as modalNotifier
|
|
07.05.2023 - added INFO-OBJECT for error msg and captcha check on sozialinfo
|
|
|
|
by KM maru21
|
|
'''
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import parseStzh
|
|
import parseSozialinfo2
|
|
|
|
from push import push
|
|
from db import db
|
|
from pathlib import Path
|
|
|
|
ntfyLnk = 'https://push.kmuhrer.net'
|
|
ntfyTopic = 'sozialinfo'
|
|
|
|
dataPath = os.path.join(Path(os.path.dirname(__file__)).parent, 'data')
|
|
dbFile = dataPath + '/articles.db'
|
|
filterFile = dataPath + '/article_filter.conf'
|
|
|
|
|
|
|
|
def jsonDump(data, file):
|
|
json.dump(data,open(file,'w'),indent=4)
|
|
|
|
|
|
|
|
def prioSort(articles):
|
|
sortArray1 = []
|
|
sortArray2 = []
|
|
sortArray3 = []
|
|
for article in articles:
|
|
if article['prio'] == 'pinned':
|
|
sortArray1.append(article)
|
|
elif article['prio'] == 'important':
|
|
sortArray2.append(article)
|
|
else:
|
|
sortArray3.append(article)
|
|
|
|
#return sortArray1 + sortArray2 + sortArray3
|
|
sortArray = sortArray3 + sortArray2
|
|
return sortArray + sortArray1
|
|
|
|
|
|
|
|
def compaireArticles(new, old):
|
|
newArticles = []
|
|
for key in new:
|
|
if not key in old:
|
|
#found new
|
|
newArticles.append(new[key])
|
|
return newArticles
|
|
|
|
|
|
|
|
def readDBTable(table):
|
|
dbCon = db(dbFile)
|
|
try:
|
|
jd = json.dumps(dbCon.readAll(table))
|
|
except:
|
|
return 'none'
|
|
return jd
|
|
|
|
|
|
|
|
def listDBTable():
|
|
dbCon = db(dbFile)
|
|
try:
|
|
jd = json.dumps(dbCon.readTables())
|
|
except:
|
|
return 'none'
|
|
return jd
|
|
|
|
|
|
|
|
async def run():
|
|
|
|
dbCon = db(dbFile)
|
|
ntfy = push(ntfyLnk, ntfyTopic)
|
|
|
|
|
|
#scrape new data from server
|
|
currentArticles = json.loads(await main())
|
|
|
|
|
|
#read data from table from previous run
|
|
dbNewArticles = []
|
|
for line in dbCon.readAll('new_articles'):
|
|
dbNewArticles.append(line[1])
|
|
|
|
|
|
#check if an error occured
|
|
#check if new articles table exists if not create and exit
|
|
#check if there are new articles and replace new articles table
|
|
errorCheck = True
|
|
if 'INFO-OBJECT' in currentArticles.keys():
|
|
if currentArticles['INFO-OBJECT']['Titel'] == 'ERROR':
|
|
errorCheck = False
|
|
if errorCheck:
|
|
if len(dbNewArticles) <= 0:
|
|
for index in currentArticles:
|
|
if index == 'INFO-OBJECT': continue
|
|
article = currentArticles[index]
|
|
dbCon.writeNew(article)
|
|
exit
|
|
|
|
if not len(currentArticles) <= 0:
|
|
dbCon.delete_table('new_articles')
|
|
for index in currentArticles:
|
|
if index == 'INFO-OBJECT': continue
|
|
article = currentArticles[index]
|
|
dbCon.writeNew(article)
|
|
|
|
|
|
#compaire db and current articles
|
|
newArticles = compaireArticles(currentArticles, dbNewArticles)
|
|
|
|
#if there are new articles push them and write permanently to db
|
|
if not len(newArticles) <= 0:
|
|
|
|
#sort new articles by priority
|
|
newArticles = prioSort(newArticles)
|
|
|
|
for article in newArticles:
|
|
ntfy.send(article)
|
|
if article['id'] == 'INFO-OBJECT': continue
|
|
dbCon.write(article)
|
|
else:
|
|
ntfy.send('none')
|
|
|
|
|
|
|
|
async def runAsync(func, ff: str):
|
|
return await asyncio.to_thread(func, ff)
|
|
|
|
|
|
|
|
async def main(mode = 'full'):
|
|
|
|
ff = filterFile
|
|
|
|
parsers = [parseStzh.init(ff, mode), parseSozialinfo2.init(ff, mode)]
|
|
articlesStzh, articlesSozialinfo = await asyncio.gather(*parsers)
|
|
|
|
articles = {**articlesStzh, **articlesSozialinfo}
|
|
|
|
return json.dumps(articles)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
asyncio.run(run()) |