batchGame/bin/parseSozialinfo2.py
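
"""Crawl job listings from sozialinfo.ch, filter them against a rules file,
and optionally fetch the detail page of every article that passes."""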
import asyncio
import hashlib
import json
from pprint import pprint
from urllib.parse import urlparse

from bs4 import BeautifulSoup

import soup
from filter import filter

async def runAddArticles(pageCrawl: int, p, url):
    # Fetch one paginated listing page and hand it to the parser; the
    # returned flag is False once a page yields no more articles.
    pageUrl = url + str(pageCrawl)
    s = soup.serve(pageUrl)
    content = await s.htmlAsync()
    responseFlag = await p.addArticles(content)
    return responseFlag


async def runAddArticleContent(url: str, key: str, p):
    # Fetch one article's detail page and attach its parsed content to the
    # article stored under `key`.
    s = soup.serve(url)
    content = await s.htmlAsync()
    await p.addArticleContent(content, key)
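

class _serveSketch:
    """Hedged illustration only: the project-local `soup` module is not part
    of this file. From its use above it is assumed to expose serve(url) with
    an awaitable htmlAsync() returning the page HTML as text. A compatible
    stand-in (assuming aiohttp is available) could look like this; the real
    module may differ."""

    def __init__(self, url):
        self.url = url

    async def htmlAsync(self):
        import aiohttp  # local import keeps the sketch optional
        async with aiohttp.ClientSession() as session:
            async with session.get(self.url) as response:
                return await response.text()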


async def init(filterFile, mode):
    url = "https://www.sozialinfo.ch/jobs?qualifications%5B%5D=Hochschulbildung&employments%5B%5D=Festanstellung&locations%5B%5D=region.58412&page="
    parts = urlparse(url)
    baseUrl = "://".join([parts.scheme, parts.netloc])
    pageCrawl = 1
    crawlStep = 10
    p = parse(baseUrl)
    # Collect every listing page into the parser's articleObj, fetching
    # crawlStep pages concurrently until one of them comes back empty.
    while True:
        runLst = [runAddArticles(i, p, url) for i in range(pageCrawl, pageCrawl + crawlStep)]
        result = await asyncio.gather(*runLst)
        pageCrawl += crawlStep
        if False in result:
            break
    # Walk the collected articles and, depending on the filter verdict,
    # drop them, tag their priority, or queue a detail-page request.
    popFilterArray = []
    runLst = []
    for key in p.articleObj:
        article = p.articleObj[key]
        match filter(article, filterFile):
            case 'all':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'some':
                pass
            case 'none':
                popFilterArray.append(key)
            case 'important':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'important'
            case 'pinned':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'pinned'
    for item in popFilterArray:
        p.articleObj.pop(item)
    await asyncio.gather(*runLst)
    return p.articleObj
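

# Assumed contract (the `filter` module is project-local and not shown here):
# filter(article, filterFile) checks one article dict against the rules in
# filterFile and returns 'all', 'some', 'none', 'important' or 'pinned',
# matching the case arms above.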


class parse:
    def __init__(self, baseUrl):
        self.baseUrl = baseUrl
        self.articleObj = {}  # article id -> record, filled by addArticles()

    async def addArticles(self, content):
        Data = {}
        DOMdocument = BeautifulSoup(content, 'html.parser')
        results = DOMdocument.find(id='app')
        if results is None:
            # No app container at all: check the raw HTML for the two known
            # failure modes before giving up.
            captchaCheck = content.find('Stellenangeboten')
            if captchaCheck == -1:
                Data['INFO-OBJECT'] = {
                    'Titel': 'ERROR',
                    'msg': 'Captcha solving needed',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none',
                    'data': content
                }
                self.articleObj = {**self.articleObj, **Data}
            connectionCheck = content.find('Error: Connection issue')
            if connectionCheck != -1:
                Data['INFO-OBJECT'] = {
                    'Titel': 'ERROR',
                    'msg': 'Connection issue',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none'
                }
                self.articleObj = {**self.articleObj, **Data}
            return False
        try:
            results = json.loads(results['data-initial-state'])
        except (KeyError, TypeError, json.JSONDecodeError):
            Data['INFO-OBJECT'] = {
                'Titel': 'ERROR',
                'msg': "Couldn't parse app json data",
                'id': 'INFO-OBJECT',
                'href': self.baseUrl,
                'prio': 'pinned',
                'content': 'none'
            }
            self.articleObj = {**self.articleObj, **Data}
            return False
        if len(results) <= 0:
            return False
        resFlag = False
        # The initial state maps list names to arrays of article dicts; pull
        # the fields we need out of every article found.
        for i in results:
            try:
                resultsList = results[i]
                for article in resultsList:
                    employmentType = article['position']
                    age = ''
                    try:
                        age = article['sortingTime'][0:10]
                        # Reorder the ISO date 'YYYY-MM-DD' into 'DD.MM.YYYY'.
                        age = '.'.join(age.split('-')[::-1])
                    except (KeyError, TypeError):
                        pass
                    title = article['title']
                    company = article['organisation']
                    # Derive a stable id from the identifying fields.
                    articleId = hashlib.md5('-'.join([employmentType, age, title, company]).encode('utf-8')).hexdigest()
                    href = article['url']
                    siId = article['uid']
                    lnk = ''.join([self.baseUrl, href])
                    Data[articleId] = {
                        'Anstellungsart': employmentType,
                        'Alter': age,
                        'Titel': title,
                        'Firma': company,
                        'id': articleId,
                        'siId': siId,
                        'href': href,
                        'lnk': lnk,
                        'prio': 'normal',
                        'content': 'none'
                    }
                    resFlag = True
            except (KeyError, TypeError):
                # Entries in the initial state that are not article lists are skipped.
                pass
        self.articleObj = {**self.articleObj, **Data}
        return resFlag

    async def addArticleContent(self, content, key):
        DOMdocument = BeautifulSoup(content, 'html.parser')
        results = DOMdocument.find(id='jobsummary')
        location = ''
        beginn = ''
        if results is None:
            return False
        listItems = results.find_all('li')
        if len(listItems) <= 0:
            return False
        # Pull start date ('Stellenantritt') and workplace ('Arbeitsort')
        # out of the job summary list.
        for i in range(len(listItems)):
            spans = listItems[i].find_all('span')
            for j in range(len(spans)):
                if spans[j].text == 'Stellenantritt':
                    beginn = ' '.join(listItems[i].text.replace('Stellenantritt', '').split())
                if spans[j].text == 'Arbeitsort':
                    try:
                        location = ' '.join(spans[j + 2].text.split())
                    except IndexError:
                        location = 'none'
        results = DOMdocument.find_all(class_='detailDescription')
        if len(results) <= 0 or len(results[0].find_all('div')) <= 0:
            return False
        div = results[0].find_all('div')[0]
        desc = div.text.replace('\n', '').replace('Stellenbeschrieb', '')
        result = DOMdocument.find(id='apply-now')
        apply = result.get('href') if result is not None else 'none'
        results = DOMdocument.find_all(class_='workingAreaLink')
        field = [item.get_text() for item in results]
        self.articleObj[key]['content'] = {
            'Ort': location,
            'Antritt': beginn,
            'Beschreibung': desc,
            'Anmelden': apply,
            'Arbeitsbereich': field,
            'Kontaktperson': 'none'
        }
        return True
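

# Minimal usage sketch (assumptions: the project-local `soup` and `filter`
# modules are importable, and 'filter.json' is a hypothetical rules-file name
# for filter(); neither comes from this file).
if __name__ == '__main__':
    articles = asyncio.run(init('filter.json', 'full'))
    pprint(articles)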