
Scraping NBA Data with Python and Predicting Game Results

The prediction model is built from five datasets: Team Per Game Stats, Opponent Per Game Stats, Miscellaneous Stats, the 2015-2016 NBA Schedule and Results, and the 2016-2017 NBA Schedule and Results — one season's results to train on and the next season's schedule to predict.

All of the data comes from Basketball-Reference. The updated crawl targets the 2020-2021 season pages: Team Per Game Stats, Opponent Per Game Stats, and Miscellaneous Stats (on the 2020-2021 pages that table is labeled Advanced Stats), plus the 2020-2021 NBA Schedule and Results.

In the schedule data, Vteam is the visiting team and Hteam is the home team.
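The scraped schedule lists raw scores rather than winners, while the training step later expects winner/loser rows. A minimal conversion sketch, assuming the CSV keeps the Vteam/Hteam names and that pandas renames the duplicated score column to PTS.1:

```python
import pandas as pd

def schedule_to_results(path):
    """Turn a scraped schedule CSV into winner/loser rows."""
    df = pd.read_csv(path)
    results = []
    for _, row in df.iterrows():
        # "PTS" / "PTS.1" are assumptions: pandas suffixes the second
        # of two identically named score columns when reading the CSV
        if row["PTS"] > row["PTS.1"]:
            results.append({"WTeam": row["Vteam"], "LTeam": row["Hteam"]})
        else:
            results.append({"WTeam": row["Hteam"], "LTeam": row["Vteam"]})
    return pd.DataFrame(results)
```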

Each team's feature vector for the model combines its Team Per Game Stats, Opponent Per Game Stats, and Miscellaneous Stats. On top of those, every team carries an Elo rating that is updated after each game and prepended to its stats.

The first version of the crawler fetches the league summary page and writes the Team and Opponent Per Game tables to CSV:

```python
import requests
import re
import csv


class NBASpider:

    def __init__(self):
        # The URL is elided in the original post; Basketball-Reference's
        # league summary page matches the table ids used below (assumption).
        self.url = "https://www.basketball-reference.com/leagues/NBA_2021.html"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }

    def send(self):
        """Request the page and return its HTML text."""
        response = requests.get(self.url, headers=self.headers)
        response.encoding = "utf-8"
        return response.text

    def parse(self, html):
        """Parse the HTML into the team and opponent tables."""
        team_heads, team_datas = self.get_team_info(html)
        opponent_heads, opponent_datas = self.get_opponent_info(html)
        return team_heads, team_datas, opponent_heads, opponent_datas

    def get_team_info(self, html):
        """Extract the Team Per Game table.

        :param html: full page HTML
        :return: team_heads (column names), team_datas (row generator)
        """
        # 1. Locate the table by its id
        team_table = re.search(r'<table.*?id="per_game-team".*?>(.*?)</table>',
                               html, re.S).group(1)
        # 2. Pull the column names out of <thead>
        team_head = re.search(r'<thead>(.*?)</thead>', team_table, re.S).group(1)
        team_heads = re.findall(r'<th.*?>(.*?)</th>', team_head, re.S)
        # 3. Pull the data rows out of the table body
        team_datas = self.get_datas(team_table)
        return team_heads, team_datas

    def get_opponent_info(self, html):
        """Extract the Opponent Per Game table.

        :param html: full page HTML
        :return: opponent_heads (column names), opponent_datas (row generator)
        """
        # 1. Locate the table by its id
        opponent_table = re.search(r'<table.*?id="per_game-opponent".*?>(.*?)</table>',
                                   html, re.S).group(1)
        # 2. Pull the column names out of <thead>
        opponent_head = re.search(r'<thead>(.*?)</thead>', opponent_table, re.S).group(1)
        opponent_heads = re.findall(r'<th.*?>(.*?)</th>', opponent_head, re.S)
        # 3. Pull the data rows out of the table body
        opponent_datas = self.get_datas(opponent_table)
        return opponent_heads, opponent_datas

    def get_datas(self, table_html):
        """Yield one list of cell values per <tr> inside <tbody>.

        :param table_html: HTML of a single table
        """
        tbody = re.search(r'<tbody>(.*?)</tbody>', table_html, re.S).group(1)
        contents = re.findall(r'<tr.*?>(.*?)</tr>', tbody, re.S)
        for oc in contents:
            rk = re.findall(r'<th.*?>(.*?)</th>', oc)           # rank cell
            datas = re.findall(r'<td.*?>(.*?)</td>', oc, re.S)  # stat cells
            # The first <td> holds the team name wrapped in a link
            datas[0] = re.search(r'<a.*?>(.*?)</a>', datas[0]).group(1)
            yield rk + datas

    def save_csv(self, title, heads, rows):
        """Write the header and rows to <title>.csv."""
        with open(title + ".csv", mode="w", encoding="utf-8", newline="") as f:
            csv_writer = csv.DictWriter(f, fieldnames=heads)
            csv_writer.writeheader()
            for row in rows:
                row_dict = {}
                for i, v in enumerate(heads):
                    row_dict[v] = row[i]
                csv_writer.writerow(row_dict)

    def crawl(self):
        # 1. Fetch the page
        res = self.send()
        # 2. Parse both tables
        team_heads, team_datas, opponent_heads, opponent_datas = self.parse(res)
        # 3. Save each table to its own CSV
        self.save_csv("team", team_heads, team_datas)
        self.save_csv("opponent", opponent_heads, opponent_datas)


if __name__ == "__main__":
    spider = NBASpider()
    spider.crawl()
```
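Not in the original, but a quick way to sanity-check the parser before writing any CSVs is to print the header row and the first data row:

```python
spider = NBASpider()
html = spider.send()
heads, rows, _, _ = spider.parse(html)
print(heads)       # column names from <thead>
print(next(rows))  # first team's per-game line
```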

A second version adds schedule crawling. send now takes the URL as a parameter (and actually sends the User-Agent header), a month-parameterized schedule_url is fetched page by page, and parse_schedule_info / get_schedule_datas pull the game rows out of the schedule table. parse, get_team_info, get_opponent_info, and get_datas are unchanged from the first listing:

```python
import requests
import re
import csv


class NBASpider:

    def __init__(self):
        # Both URLs are elided in the original; these Basketball-Reference
        # pages match the table ids used below (assumption).
        self.url = "https://www.basketball-reference.com/leagues/NBA_2021.html"
        self.schedule_url = "https://www.basketball-reference.com/leagues/NBA_2021_games-{}.html"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }

    def send(self, url):
        """Request any URL with the browser User-Agent and return its HTML."""
        response = requests.get(url, headers=self.headers)
        response.encoding = "utf-8"
        return response.text

    # parse, get_team_info, get_opponent_info and get_datas are unchanged
    # from the first listing.

    def get_schedule_datas(self, table_html):
        """Yield one list of cell values per game row in the schedule table."""
        tbody = re.search(r'<tbody>(.*?)</tbody>', table_html, re.S).group(1)
        contents = re.findall(r'<tr.*?>(.*?)</tr>', tbody, re.S)
        for oc in contents:
            # The date cell is a <th> wrapping a link
            rk = re.findall(r'<th.*?><a.*?>(.*?)</a></th>', oc)
            datas = re.findall(r'<td.*?>(.*?)</td>', oc, re.S)
            if datas and len(datas) > 0:
                # Visitor team, home team, and box-score cells are links
                datas[1] = re.search(r'<a.*?>(.*?)</a>', datas[1]).group(1)
                datas[3] = re.search(r'<a.*?>(.*?)</a>', datas[3]).group(1)
                datas[5] = re.search(r'<a.*?>(.*?)</a>', datas[5]).group(1)
            yield rk + datas

    def parse_schedule_info(self, html):
        """Extract the schedule table.

        :param html: full page HTML
        :return: heads (column names), datas (row generator)
        """
        # 1. Locate the table; the captured fragment can lack a closing
        #    </tbody>, so append one to keep the inner regexes working
        table = re.search(r'<table.*?id="schedule" data-cols-to-freeze=",1"(.*?)</table>',
                          html, re.S).group(1)
        table = table + "</tbody>"
        # 2. Column names from <thead>
        head = re.search(r'<thead>(.*?)</thead>', table, re.S).group(1)
        heads = re.findall(r'<th.*?>(.*?)</th>', head, re.S)
        # 3. Data rows
        datas = self.get_schedule_datas(table)
        return heads, datas

    def save_csv(self, title, heads, rows):
        """Write the header and rows to <title>.csv, padding short rows."""
        with open(title + ".csv", mode="w", encoding="utf-8", newline="") as f:
            csv_writer = csv.DictWriter(f, fieldnames=heads)
            csv_writer.writeheader()
            for row in rows:
                row_dict = {}
                if heads and len(heads) > 0:
                    for i, v in enumerate(heads):
                        row_dict[v] = row[i] if len(row) > i else ""
                csv_writer.writerow(row_dict)

    def crawl_team_opponent(self):
        # 1. Fetch the stats page
        res = self.send(self.url)
        # 2. Parse both tables
        team_heads, team_datas, opponent_heads, opponent_datas = self.parse(res)
        # 3. Save each table to its own CSV
        self.save_csv("team", team_heads, team_datas)
        self.save_csv("opponent", opponent_heads, opponent_datas)

    def crawl_schedule(self):
        months = ["october", "november", "december", "january",
                  "february", "march", "april", "may", "june"]
        for month in months:
            html = self.send(self.schedule_url.format(month))
            heads, datas = self.parse_schedule_info(html)
            # Save each month's games to schedule_<month>.csv
            self.save_csv("schedule_" + month, heads, datas)

    def crawl(self):
        self.crawl_schedule()


if __name__ == "__main__":
    spider = NBASpider()
    spider.crawl()
```
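To try a single month without running the whole crawl (hypothetical usage, not in the original):

```python
spider = NBASpider()
html = spider.send(spider.schedule_url.format("december"))
heads, datas = spider.parse_schedule_info(html)
print(heads)
print(next(datas))  # first game of the month
```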

A third pass hardens the crawler and covers the Advanced Stats table. send gains a 30-second timeout, the Advanced table is walked with lxml's XPath instead of regexes, and save_csv is simplified to a plain csv.writer. Methods not shown are unchanged from the previous listing; the XPath expressions are assumptions, since the original only shows the xpath() call sites:

```python
import requests
import re
import csv
from lxml import etree


class NBASpider:
    # __init__, parse, get_team_info, get_opponent_info, get_datas,
    # get_schedule_datas and parse_schedule_info are unchanged from
    # the previous listing.

    def send(self, url):
        """Request a URL with the browser User-Agent; give up after 30 s."""
        response = requests.get(url, headers=self.headers, timeout=30)
        response.encoding = "utf-8"
        return response.text

    def get_advanced_team_datas(self, table):
        """Yield one list of cell values per row of the Advanced Stats table."""
        trs = table.xpath('./tbody/tr')
        for tr in trs:
            rk = tr.xpath('./th/text()')                   # rank cell
            team = tr.xpath('./td[1]/a/text()')            # team-name link
            datas = tr.xpath('./td[position()>1]/text()')  # stat cells
            yield rk + team + datas

    def parse_advanced_info(self, html):
        """Extract the Advanced Stats table with XPath.

        :param html: full page HTML
        :return: heads (column names), datas (row generator)
        """
        # Caveat: Basketball-Reference ships some tables inside HTML
        # comments; regexes still see them, but a parsed DOM does not,
        # so the comment markers may need stripping first (assumption)
        selector = etree.HTML(html)
        # 1. Locate the table
        table = selector.xpath('//table[@id="advanced-team"]')[0]
        # 2. Column names from the last header row
        heads = table.xpath('./thead/tr[last()]/th/text()')
        # 3. Data rows
        datas = self.get_advanced_team_datas(table)
        return heads, datas

    def save_csv(self, title, heads, rows):
        """Plain csv.writer version: one header row, then the data rows."""
        f = open(title + ".csv", mode="w", encoding="utf-8", newline="")
        csv_writer = csv.writer(f)
        csv_writer.writerow(heads)
        for row in rows:
            csv_writer.writerow(row)
        f.close()

    def crawl_team_opponent(self):
        # 1. Fetch the stats page
        res = self.send(self.url)
        # 2. Parse the team and opponent tables
        team_heads, team_datas, opponent_heads, opponent_datas = self.parse(res)
        # 3. Save each to its own CSV
        self.save_csv("team", team_heads, team_datas)
        self.save_csv("opponent", opponent_heads, opponent_datas)

    def crawl_schedule(self):
        months = ["october", "november", "december", "january",
                  "february", "march", "april", "may", "june"]
        for month in months:
            html = self.send(self.schedule_url.format(month))
            heads, datas = self.parse_schedule_info(html)
            self.save_csv("schedule_" + month, heads, datas)
```
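The original's final crawl wiring is cut off, so this is an assumed hook for the new table (the XPath version requires pip install lxml):

```python
def crawl_advanced(self):
    html = self.send(self.url)
    heads, datas = self.parse_advanced_info(html)
    self.save_csv("advanced", heads, datas)
```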

With the CSVs in hand, the modeling side starts by dropping columns that don't describe team strength and merging the three stats tables on the team name, indexed by team:

```python
import pandas as pd

def PruneData(M_stat, O_stat, T_stat):
    """Drop bookkeeping columns and merge the three stats tables."""
    # The exact drop lists are cut off in the original; rank and
    # games/minutes bookkeeping columns are a reasonable guess
    pruneM = M_stat.drop(['Rk', 'Arena'], axis=1)
    pruneO = O_stat.drop(['Rk', 'G', 'MP'], axis=1)
    pruneT = T_stat.drop(['Rk', 'G', 'MP'], axis=1)
    # Merge on the team name, then index by team
    mergeMO = pd.merge(pruneM, pruneO, how='left', on='Team')
    newstat = pd.merge(mergeMO, pruneT, how='left', on='Team')
    return newstat.set_index('Team', drop=True, append=False)
```

Each game's outcome feeds an Elo rating. Given ratings R1 and R2, the expected scores are:

```python
E1 = 1 / (1 + math.pow(10, (R2 - R1) / 400))
E2 = 1 / (1 + math.pow(10, (R1 - R2) / 400))
```
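The update step itself is cut off in the original, so here is a minimal sketch built from the formulas above; the flat K-factor of 32 and the 1600 starting rating are assumptions:

```python
import math

base_elo = 1600   # assumed starting rating for every team
team_elos = {}

def GetElo(team):
    """Return a team's current Elo, seeding new teams at base_elo."""
    if team not in team_elos:
        team_elos[team] = base_elo
    return team_elos[team]

def CalcElo(winteam, loseteam):
    """One Elo update using the E1/E2 formulas above (K=32 assumed)."""
    R1, R2 = GetElo(winteam), GetElo(loseteam)
    E1 = 1 / (1 + math.pow(10, (R2 - R1) / 400))  # winner's expected score
    E2 = 1 - E1                                   # loser's expected score
    k = 32
    return round(R1 + k * (1 - E1)), round(R2 + k * (0 - E2))
```

As a sanity check, a 1600-rated team beating a 1500-rated one expects E1 ≈ 0.64, so the winner gains about 12 points and the loser drops the same amount.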

GenerateTrainData walks one season of results; each game becomes one training row of the form [team1 features + team2 features]. A team's features are its current Elo followed by every merged stat, and both teams' Elo ratings are updated once the row is recorded:

```python
import random
import numpy as np

def GenerateTrainData(stat, trainresult):
    """Build X (feature rows) and y (labels) from a season of results."""
    X = []
    y = []
    for index, rows in trainresult.iterrows():
        winteam = rows['WTeam']
        loseteam = rows['LTeam']
        # Feature vector: current Elo first, then every merged stat
        # (the original used the older Series.iteritems(); .items()
        # is its modern spelling)
        fea_win = [GetElo(winteam)]
        fea_lose = [GetElo(loseteam)]
        for key, value in stat.loc[winteam].items():
            fea_win.append(value)
        for key, value in stat.loc[loseteam].items():
            fea_lose.append(value)
        # Randomize which side the winner sits on so the label varies
        # (assumption: this part of the original is cut off)
        if random.random() > 0.5:
            X.append(fea_win + fea_lose)
            y.append(0)
        else:
            X.append(fea_lose + fea_win)
            y.append(1)
        # Update both teams' Elo after the game
        win_new_score, lose_new_score = CalcElo(winteam, loseteam)
        team_elos[winteam] = win_new_score
        team_elos[loseteam] = lose_new_score
    return np.nan_to_num(X), np.array(y)
```

np.nan_to_num(x) replaces every NaN in x with 0 and every infinity with a large finite number, so missing stats cannot break the model. Prediction features mirror the training ones: look up both teams' stats rows, prepend each team's Elo, concatenate the two vectors into one row, and run the result through nan_to_num so it matches the training X layout:

```python
def GeneratePredictData(pre_data, info):
    """Build one feature row per upcoming game (minimal sketch; the
    Vteam/Hteam column names follow the schedule CSV)."""
    X = []
    for index, rows in pre_data.iterrows():
        team1 = rows['Vteam']   # visiting team
        team2 = rows['Hteam']   # home team
        fea1 = [GetElo(team1)]
        fea2 = [GetElo(team2)]
        for key, value in info.loc[team1].items():
            fea1.append(value)
        for key, value in info.loc[team2].items():
            fea2.append(value)
        # Concatenate into one row, in the same layout as the training X
        X.append(fea1 + fea2)
    return np.nan_to_num(X)
```
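A one-line check of what the cleanup does:

```python
import numpy as np

print(np.nan_to_num([1.0, np.nan, np.inf]))  # NaN -> 0.0, inf -> ~1.8e308
```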

The main flow ties it together: load the CSVs, prune and merge them, generate the training set, fit a logistic regression, and report 10-fold cross-validated accuracy:

```python
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

folder = 'data'  # directory holding the scraped CSVs (assumed name)

# The file names are cut off in the original; these match the crawler output
M_stat = pd.read_csv(folder + '/advanced.csv')     # Miscellaneous/Advanced Stats
O_stat = pd.read_csv(folder + '/opponent.csv')     # Opponent Per Game Stats
T_stat = pd.read_csv(folder + '/team.csv')         # Team Per Game Stats
team_result = pd.read_csv(folder + '/result.csv')  # winner/loser rows (assumed name)

teamstat = PruneData(M_stat, O_stat, T_stat)
X, y = GenerateTrainData(teamstat, team_result)

limodel = linear_model.LogisticRegression()
limodel.fit(X, y)

# 10-fold cross-validated accuracy
print(cross_val_score(limodel, X, y, cv=10, scoring='accuracy'))
```
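cross_val_score returns one accuracy per fold; the mean is the usual single-number summary:

```python
scores = cross_val_score(limodel, X, y, cv=10, scoring='accuracy')
print(scores.mean())
```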

Prediction mirrors training: build feature rows for the upcoming games, call predict_proba, and record each matchup with its corresponding probability of winning:

```python
# Upcoming games; the file name is cut off in the original (assumed)
pre_data = pd.read_csv(folder + '/schedule_next.csv')

pre_X = GeneratePredictData(pre_data, teamstat)
pre_y = limodel.predict_proba(pre_X)

predictlist = []
for index, rows in pre_data.iterrows():
    # Class 0 means "the left-hand (visiting) team wins" under the
    # labeling used in GenerateTrainData above
    reslt = [rows['Vteam'], rows['Hteam'], pre_y[index][0]]
    predictlist.append(reslt)
```


One environment-setup pitfall: if conda reports CondaHTTPError: HTTP 000 CONNECTION FAILED for url while installing the dependencies, switching the https channel URLs in your conda config to http is the usual workaround.
