batchGame/bin/parseSozialinfo.py

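"""Async scraper for sozialinfo.ch job listings.

Crawls the paginated extended-search results (canton Zürich, 0-100%
workload), builds one record per posting in parse.articleObj, classifies
each record with the local filter module, and in 'full' mode also fetches
every kept posting's detail page.
"""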
import asyncio
import hashlib
from urllib.parse import urlparse

from bs4 import BeautifulSoup

import soup  # local helper; soup.serve(url) returns an object exposing htmlAsync()
from filter import filter  # local helper; classifies an article against a filter file


async def runAddArticles(pageCrawl: int, p, url):
    #print(f'launching for page: {pageCrawl}')
    pageUrl = url + str(pageCrawl)
    s = soup.serve(pageUrl)
    content = await s.htmlAsync()
    responseFlag = await p.addArticles(content)
    return responseFlag


async def runAddArticleContent(url: str, key: str, p):
    s = soup.serve(url)
    content = await s.htmlAsync()
    await p.addArticleContent(content, key)


async def init(filterFile, mode):
    url = "https://www.sozialinfo.ch/jobs/erweiterte-suche?extsearch[filter][]=canton:Z%C3%BCrich&extsearch[filter][]=percentOf:0-100&extsearch[page]="
    baseUrl = "://".join([urlparse(url).scheme, urlparse(url).netloc])
    pageCrawl = 1
    crawlStep = 10
    # Crawl result pages in batches of crawlStep until a page yields no
    # articles (addArticles returns False), collecting everything into
    # p.articleObj.
    p = parse(baseUrl)
    while True:
        runLst = [runAddArticles(i, p, url) for i in range(pageCrawl, pageCrawl + crawlStep)]
        result = await asyncio.gather(*runLst)
        pageCrawl += crawlStep
        if False in result:
            break
    # Walk the collected articles and, depending on the filter verdict,
    # schedule a detail-page request, tag a priority, or drop the entry.
    popFilterArray = []
    runLst = []
    for key in p.articleObj:
        article = p.articleObj[key]
        match filter(article, filterFile):
            case 'all':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'some':
                pass
            case 'none':
                popFilterArray.append(key)
            case 'important':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'important'
            case 'pinned':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'pinned'
    for item in popFilterArray:
        p.articleObj.pop(item)
    await asyncio.gather(*runLst)
    return p.articleObj


class parse:
    def __init__(self, baseUrl):
        self.baseUrl = baseUrl
        self.articleObj = {}

    async def addArticles(self, content):
        Data = {}
        DOMdocument = BeautifulSoup(content, 'html.parser')
        results = DOMdocument.find(id='results-list')
        if results is None:
            # No result list in the markup: either the site served a captcha
            # page or the fetch failed. Record a pinned INFO-OBJECT entry so
            # the problem surfaces downstream.
            captchaCheck = content.find('Stellenangebote')
            if captchaCheck == -1:
                Data['INFO-OBJECT'] = {
                    'Titel': 'ERROR',
                    'msg': 'Captcha solving needed',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none',
                    'data': content
                }
                self.articleObj = {**self.articleObj, **Data}
            connectionCheck = content.find('Error: Connection issue')
            if connectionCheck != -1:
                Data['INFO-OBJECT'] = {
                    'Titel': 'ERROR',
                    'msg': 'Connection issue',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none'
                }
                self.articleObj = {**self.articleObj, **Data}
            return False
        articles = results.find_all("article")
        if not articles:
            return False
        for article in articles:
            type = article.find_all('div', class_='info-bar')[0].contents[1].text
            age = article.find_all('div', class_='info-bar')[0].contents[3].text.split('\n')[1].split()[0]
            title = ' '.join(' '.join(article.find_all('div', class_='content-body')[0].find('h3').text.split('\n')).split())
            company = article.find_all('div', class_='footer')[0].find_all('span', class_='text-bold')[0].text
            # Key each posting by a hash of its identifying fields.
            id = hashlib.md5("-".join([type, age, title, company]).encode('utf-8')).hexdigest()
            href = article.find('a').get('href')
            siId = href.split('-')[-1]
            lnk = "".join([self.baseUrl, href])
            Data[id] = {
                'Anstellungsart': type,
                'Alter': age,
                'Titel': title,
                'Firma': company,
                'id': id,
                'siId': siId,
                'href': href,
                'lnk': lnk,
                'prio': 'normal',
                'content': 'none'
            }
        self.articleObj = {**self.articleObj, **Data}
        return True

    async def addArticleContent(self, content, id):
        DOMdocument = BeautifulSoup(content, 'html.parser')
        results = DOMdocument.find(id='jobsummary')
        location = ''
        beginn = ''  # initialise: not every posting has a 'Stellenantritt' row
        if results is None:
            return False
        listItems = results.find_all("li")
        if not listItems:
            return False
        # The summary list holds label/value pairs; pull out start date and location.
        for i in range(len(listItems)):
            spans = listItems[i].find_all('span')
            for j in range(len(spans)):
                if spans[j].text == 'Stellenantritt':
                    beginn = ' '.join(listItems[i].text.replace('Stellenantritt', '').split())
                if spans[j].text == 'Arbeitsort':
                    try:
                        location = ' '.join(spans[j + 2].text.split())
                    except IndexError:
                        location = 'none'
        results = DOMdocument.find_all(class_='detailDescription')
        if len(results[0].find_all("div")) <= 0:
            return False
        div = results[0].find_all("div")[0]
        desc = div.text.replace('\n', '').replace('Stellenbeschrieb', '')
        result = DOMdocument.find(id='apply-now')
        # Guard against a missing apply link instead of raising AttributeError.
        apply = result.get('href') if result is not None else 'none'
        results = DOMdocument.find_all(class_='workingAreaLink')
        field = [r.get_text() for r in results]
        self.articleObj[id]['content'] = {
            'Ort': location,
            'Antritt': beginn,
            'Beschreibung': desc,
            'Anmelden': apply,
            'Arbeitsbereich': field,
            'Kontaktperson': 'none'
        }