import asyncio
import hashlib
import json
from pprint import pprint
from urllib.parse import urlparse

from bs4 import BeautifulSoup

import soup
from filter import filter

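# Scraper for job listings on sozialinfo.ch: init() crawls the paginated
# search results concurrently, collects the articles through the parse class,
# applies the external filter() rules and, in 'full' mode, fetches the detail
# page of every article that passes the filter.

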
async def runAddArticles(pageCrawl: int, p, url):
    # Fetch one result page and feed its HTML to the parser.
    #print(f'launching for page: {pageCrawl}')
    pageUrl = url + str(pageCrawl)
    s = soup.serve(pageUrl)
    content = await s.htmlAsync()
    responseFlag = await p.addArticles(content)
    return responseFlag


async def runAddArticleContent(url: str, key: str, p):
    # Fetch one article detail page and attach its content to the parser entry.
    s = soup.serve(url)
    content = await s.htmlAsync()
    await p.addArticleContent(content, key)


async def init(filterFile, mode):
    url = "https://www.sozialinfo.ch/jobs?qualifications%5B%5D=Hochschulbildung&employments%5B%5D=Festanstellung&locations%5B%5D=region.58412&page="
    baseUrl = "://".join([urlparse(url).scheme, urlparse(url).netloc])
    pageCrawl = 1
    crawlStep = 10

    p = parse(baseUrl)

    # Crawl the result pages in batches of crawlStep until a batch yields no
    # more articles, collecting everything into the parser's articleObj.
    while True:
        runLst = [runAddArticles(i, p, url) for i in range(pageCrawl, pageCrawl + crawlStep)]
        result = await asyncio.gather(*runLst)
        pageCrawl += crawlStep
        if False in result:
            break

    # Walk the collected articles, apply the filter rules and, in 'full' mode,
    # queue detail-page requests for the articles that are kept.
    popFilterArray = []
    runLst = []
    for key in p.articleObj:
        article = p.articleObj[key]

        match filter(article, filterFile):
            case 'all':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'some':
                pass
            case 'none':
                popFilterArray.append(key)
            case 'important':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'important'
            case 'pinned':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'pinned'

    # Drop the articles the filter rejected.
    for item in popFilterArray:
        p.articleObj.pop(item)

    await asyncio.gather(*runLst)

    return p.articleObj


class parse:
    # Accumulates job articles scraped from the listing pages (addArticles)
    # and enriches them with detail-page data (addArticleContent).

    def __init__(self, baseUrl):
        self.baseUrl = baseUrl
        self.articleObj = {}

    async def addArticles(self, content):
        # Parse one listing page. Returns True if at least one article was
        # extracted, False otherwise (end of pagination or an error, which is
        # reported as a pinned 'INFO-OBJECT' pseudo-article).
        Data = {}

        DOMdocument = BeautifulSoup(content, 'html.parser')

        results = DOMdocument.find(id='app')

        try:
            results = json.loads(results['data-initial-state'])
        except Exception:
            type = 'ERROR'
            msg = "Couldn't parse app JSON data"
            Data['INFO-OBJECT'] = {
                'Titel': type,
                'msg': msg,
                'id': 'INFO-OBJECT',
                'href': self.baseUrl,
                'prio': 'pinned',
                'content': 'none'
            }
            newArticleObj = {**self.articleObj, **Data}
            self.articleObj = newArticleObj
            return False

        if results is None:
            # No usable initial state: try to tell a captcha page apart from
            # a connection problem.
            captchaCheck = content.find('Stellenangeboten')
            if captchaCheck == -1:
                type = 'ERROR'
                msg = 'Captcha solving needed'
                Data['INFO-OBJECT'] = {
                    'Titel': type,
                    'msg': msg,
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none',
                    'data': content
                }
                newArticleObj = {**self.articleObj, **Data}
                self.articleObj = newArticleObj

            connectionCheck = content.find('Error: Connection issue')
            if connectionCheck != -1:
                type = 'ERROR'
                msg = 'Connection issue'
                Data['INFO-OBJECT'] = {
                    'Titel': type,
                    'msg': msg,
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none'
                }
                newArticleObj = {**self.articleObj, **Data}
                self.articleObj = newArticleObj

            return False

        if len(results) <= 0:
            return False

        # Flatten the result lists from the initial state and build one
        # dictionary entry per article, keyed by an MD5 hash of its core fields.
        resFlag = False
        for i in results:
            try:
                resultsList = results[i]
                for article in resultsList:
                    type = article['position']

                    # Reformat the ISO date (YYYY-MM-DD) to DD.MM.YYYY.
                    age = ''
                    try:
                        age = article['sortingTime'][0:10]
                        age = age.split("-")
                        age = '.'.join(age[::-1])
                    except Exception:
                        age = article['sortingTime'][0:10]
                    title = article['title']
                    company = article['organisation']
                    id = hashlib.md5("-".join([type, age, title, company]).encode('utf-8')).hexdigest()
                    href = article['url']
                    siId = article['uid']
                    lnk = "".join([self.baseUrl, href])

                    Data[id] = {
                        'Anstellungsart': type,
                        'Alter': age,
                        'Titel': title,
                        'Firma': company,
                        'id': id,
                        'siId': siId,
                        'href': href,
                        'lnk': lnk,
                        'prio': 'normal',
                        'content': 'none'
                    }

                    resFlag = True
            except Exception:
                pass

        newArticleObj = {**self.articleObj, **Data}
        self.articleObj = newArticleObj
        return resFlag

    async def addArticleContent(self, content, id):
        # Parse one article detail page and store the extracted fields under
        # the article's 'content' key.
        DOMdocument = BeautifulSoup(content, 'html.parser')

        results = DOMdocument.find(id='jobsummary')

        location = ''
        beginn = ''

        if results is None:
            return False

        listItems = results.find_all("li")
        if len(listItems) <= 0:
            return False

        # Starting date and workplace are announced by 'Stellenantritt' /
        # 'Arbeitsort' spans inside the summary list items.
        for i in range(len(listItems)):
            spans = listItems[i].find_all('span')
            for j in range(len(spans)):
                if spans[j].text == 'Stellenantritt':
                    beginn = ' '.join(listItems[i].text.replace('Stellenantritt', '').split())
                if spans[j].text == 'Arbeitsort':
                    try:
                        location = ' '.join(spans[j + 2].text.split())
                    except Exception:
                        location = 'none'

        # Job description text.
        results = DOMdocument.find_all(class_='detailDescription')
        if len(results) <= 0 or len(results[0].find_all("div")) <= 0:
            return False

        div = results[0].find_all("div")[0]
        desc = ''.join(div.text).replace('\n', '')
        desc = desc.replace('Stellenbeschrieb', '')

        # Application link (may be missing on some pages).
        result = DOMdocument.find(id='apply-now')
        apply = result.get('href') if result is not None else 'none'

        # Working-area tags.
        results = DOMdocument.find_all(class_='workingAreaLink')
        field = []
        for i in range(len(results)):
            field.append(results[i].get_text())

        self.articleObj[id]['content'] = {
            'Ort': location,
            'Antritt': beginn,
            'Beschreibung': desc,
            'Anmelden': apply,
            'Arbeitsbereich': field,
            'Kontaktperson': 'none'
        }
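

# Minimal usage sketch, assuming the module can also be run directly; the
# filter file name 'filter.json' and the 'full' mode value are illustrative
# assumptions, not taken from the original code.
if __name__ == '__main__':
    scraped = asyncio.run(init('filter.json', 'full'))
    pprint(scraped)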