Source code for jBScraper
import sys
import requests
import time
from bs4 import BeautifulSoup

# Scrapes data from https://www.oejv.com/bundesliga/ergebnissescores
# Disclaimer: there are not many tests for null objects, ...


class JBScraper:

    def __init__(self):
        """
        Creates a JBScraper object.
        """
        pass
    def scrapeJB(self):
        """
        Scrapes data from https://www.oejv.com/bundesliga/ergebnissescores and stores the collected data in .txt files.
        """
        #####################################################################
        url = "https://www.oejv.com/bundesliga/ergebnissescores/"
        placeToStoreResults = '../results/'
        startYear = 2011
        endYear = 2018
        #####################################################################
        year = 2011
        # for year in range(startYear, endYear + 1):  # TODO activate
        # Prepare the output files and write their header rows
        nameBegegnungen = placeToStoreResults + 'Begegnungen' + str(year) + '.txt'  # file name built by concatenation
        fileBegegnungen = open(nameBegegnungen, 'w', encoding='utf-8')  # ATTENTION: overwrites file
        fileBegegnungen.write('id; nameheim; winFirstHalfH; UBWFirstHalfH; winEndH; UBWEndH; nameauswaerts; winFirstHalfA;'
                              ' UBWFirstHalfA; winEndA; UBWEndA\n')
        fileBegegnungen.close()
        nameEinzelKaempfe = placeToStoreResults + 'EinzelKaempfe' + str(year) + '.txt'  # file name built by concatenation
        fileEinzelKaempfe = open(nameEinzelKaempfe, 'w', encoding='utf-8')  # ATTENTION: overwrites file
        fileEinzelKaempfe.write('id; fightNo; GivennameFirstnameH; YukoH; WazzariH; IpponH; ShidoH; HSMH; WinH; UBWH;'
                                ' GivennameFirstnameA; YukoA; WazzariA; IpponA; ShidoA; HSMA; WinA; UBWA\n')
        fileEinzelKaempfe.close()

        # Request the results overview for the selected season
        data = {'jama_saison': year}
        r = requests.post(url, data=data)
        soupFightsOverview = BeautifulSoup(r.content, 'html.parser')
        # Collect the links to the individual fight pages
        urlOfFights = []
        links = soupFightsOverview.find_all(onclick="openwin(this.href); return false")  # 115 elements
        # TODO Find round name (Runde 1, Runde 2, ...) here (siblings)
        idOfFights2 = []
        for link in links:
            idOfFights2.append(link.get("href"))
        idOfFights = idOfFights2[1:][::2]  # every second href, skipping the first
        # URLs of the fights
        for link in idOfFights:
            urlOfFights.append(url + link)

        # Fetch and parse each fight page
        # TODO: still need to decide up to where the first league goes
        for urlOfFight in urlOfFights:
            rFight = requests.get(urlOfFight)
            htmlFight = rFight.content
            soupSpecificFight = BeautifulSoup(htmlFight, 'html.parser')
            begegnungenId = urlOfFight[65:69]  # the fight id is encoded in the URL
            # Append this Begegnung (team match) to the results file
            fileBegegnungen = open(nameBegegnungen, 'a', encoding='utf-8')  # ATTENTION: appends to file
            print(nameBegegnungen)
            if not soupSpecificFight.find_all(colspan='7'):
                nameheim = soupSpecificFight.find_all(colspan='8')[0]
            else:
                nameheim = soupSpecificFight.find_all(colspan='7')[0]
            nameheimStr = nameheim.contents[0][7:]  # len('HEIM * ') == 7
            labels = soupSpecificFight.find_all(bgcolor="#bbbfbb")  # Find all fights
            if year == 2011:
                # TODO: compute the Zwischenstand (half-time score) correctly for 2011
                zwischenstand = labels[2].find_all(bgcolor="#57a8f7")
                endstand = labels[2].find_all(bgcolor="#57a8f7")
            else:
                zwischenstand = labels[2].find_all(bgcolor="#57a8f7")
                endstand = labels[3].find_all(bgcolor="#57a8f7")
            # TODO why does this work?
            if not soupSpecificFight.find_all(colspan='7'):
                nameauswaerts = soupSpecificFight.find_all(colspan='8')[1]
            elif len(soupSpecificFight.find_all(colspan='7')) > 1:
                nameauswaerts = soupSpecificFight.find_all(colspan='7')[1]
            else:
                nameauswaerts = soupSpecificFight.find_all(colspan='8')[0]
            nameauswaertsStr = nameauswaerts.contents[0][11:]  # len('AUSWAERTS *') == 11
            kampfzeile = begegnungenId \
                + '; ' + nameheimStr \
                + '; ' + zwischenstand[0].contents[0] \
                + '; ' + zwischenstand[1].contents[0] \
                + '; ' + endstand[0].contents[0] \
                + '; ' + endstand[1].contents[0] \
                + '; ' + nameauswaertsStr \
                + '; ' + zwischenstand[2].contents[0] \
                + '; ' + zwischenstand[3].contents[0] \
                + '; ' + endstand[2].contents[0] \
                + '; ' + endstand[3].contents[0] \
                + '\n'
            fileBegegnungen.write(kampfzeile)

            # Append the EinzelKaempfe (individual bouts)
            fileEinzelKaempfe = open(nameEinzelKaempfe, 'a', encoding='utf-8')  # ATTENTION: appends to file
            einzelkaempfe = soupSpecificFight.find_all(bgcolor="#d5d5d5")  # Find all individual bouts
            j = 0
            kampf = ''
            # loop over each bout
            for einzelkampf in einzelkaempfe:
                j = j + 1
                info = einzelkampf.find_all('td')
                kampf += begegnungenId + '; ' + str(j)
                numberOfColumns = len(info)
                maxlength = 18
                withYuko = numberOfColumns == maxlength
                for i in range(2, numberOfColumns):
                    if (not withYuko) and (i == 3 or i == 10):
                        kampf += '; -1'  # pad the missing Yuko column with -1
                    kampf += '; ' + (info[i].contents[0] if len(info[i]) > 0 else '_________')
                kampf += '\n'
            # write the collected data into the file
            fileEinzelKaempfe.write(kampf)
            time.sleep(1)  # be polite: wait a second between requests
            kampf = ''
            fileBegegnungen.close()
            fileEinzelKaempfe.close()


if __name__ == "__main__":
    jBScraper = JBScraper()
    jBScraper.scrapeJB()
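
# A minimal read-back sketch (not part of the scraper): the generated files are
# semicolon-separated text, so they can be loaded for analysis, e.g. with pandas.
# The paths below are assumptions based on placeToStoreResults ('../results/')
# and year = 2011 as configured above.
#
#   import pandas as pd
#   begegnungen = pd.read_csv('../results/Begegnungen2011.txt', sep=';', skipinitialspace=True)
#   einzelkaempfe = pd.read_csv('../results/EinzelKaempfe2011.txt', sep=';', skipinitialspace=True)
#   print(begegnungen.head())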