纽卡VS布莱顿直播_纽卡VS布莱顿免费在线高清直播_纽卡VS布莱顿视频在线观看无插件

发布时间:2026-05-12 23:04:37 | 浏览:
直播信号源

纽卡VS布莱顿直播_纽卡VS布莱顿免费在线高清直播_纽卡VS布莱顿视频在线观看无插件

import requests

import re

import csv

from parsel import Selector

class NBASpider:
    """Download NBA tables from basketball-reference.com and save them as CSV.

    Three groups of data are handled: the team / opponent tables of a league
    page (regex based), the monthly schedule pages (regex based) and the
    "advanced-team" table (XPath based).

    NOTE(review): this class was recovered from a copy whose indentation had
    been flattened and whose regular expressions had lost their HTML tag
    delimiters (every pattern had collapsed to ``(.*?)``, which can only
    match the empty string). The element names used in the patterns below
    mirror the XPath-based methods of this class (``thead/tr/th``,
    ``tbody/tr``, ``th``, ``td``, ``td/a``). The three table ids below are
    assumptions -- confirm them against the live pages.
    """

    # NOTE(review): assumed ids of the regex-parsed tables -- TODO confirm.
    _TEAM_TABLE_ID = "per_game-team"
    _OPPONENT_TABLE_ID = "per_game-opponent"
    _SCHEDULE_TABLE_ID = "schedule"

    # Compiled once; ``\b`` keeps ``<th`` from matching ``<thead`` and so on.
    _THEAD_RE = re.compile(r'<thead\b[^>]*>(.*?)</thead>', re.S)
    _TBODY_RE = re.compile(r'<tbody\b[^>]*>(.*?)</tbody>', re.S)
    _TR_RE = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.S)
    _TH_RE = re.compile(r'<th\b[^>]*>(.*?)</th>', re.S)
    _TD_RE = re.compile(r'<td\b[^>]*>(.*?)</td>', re.S)
    _ANCHOR_RE = re.compile(r'<a\b[^>]*>(.*?)</a>', re.S)

    def __init__(self):
        """Set the page URLs and the request headers used by every fetch."""
        self.url = "https://www.basketball-reference.com/leagues/NBA_2021.html"
        # ``{}`` is filled with a lower-case month name by crawl_schedule().
        self.schedule_url = "https://www.basketball-reference.com/leagues/NBA_2016_games-{}.html"
        self.advanced_team_url = "https://www.basketball-reference.com/leagues/NBA_2016.html"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 "
                          "Safari/537.36"
        }

    # ------------------------------------------------------------------
    # Fetching
    # ------------------------------------------------------------------
    def send(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        :param url: address of the page to download
        :return: page text
        :raises requests.HTTPError: if the server answers with a 4xx/5xx status
        """
        response = requests.get(url, headers=self.headers, timeout=30)
        # Fail here with a clear error instead of parsing an error page later.
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response.text

    # ------------------------------------------------------------------
    # Private regex helpers
    # ------------------------------------------------------------------
    def _find_table(self, html, table_id):
        """Return the inner HTML of the ``<table>`` whose id is ``table_id``."""
        pattern = r'<table\b[^>]*\bid="{}"[^>]*>(.*?)</table>'.format(re.escape(table_id))
        match = re.search(pattern, html, re.S)
        if match is None:
            raise ValueError('table with id "{}" not found in page'.format(table_id))
        return match.group(1)

    def _parse_heads(self, table_html):
        """Return the contents of every ``<th>`` inside the table's ``<thead>``."""
        thead = self._THEAD_RE.search(table_html)
        if thead is None:
            raise ValueError("table has no <thead> section")
        return self._TH_RE.findall(thead.group(1))

    def _body_rows(self, table_html):
        """Yield ``(th_cells, td_cells)`` for each ``<tr>`` of the ``<tbody>``."""
        tbody = self._TBODY_RE.search(table_html)
        if tbody is None:
            raise ValueError("table has no <tbody> section")
        for row_html in self._TR_RE.findall(tbody.group(1)):
            yield self._TH_RE.findall(row_html), self._TD_RE.findall(row_html)

    def _anchor_text(self, cell_html):
        """Return the text of the first ``<a>`` in a cell, or the cell unchanged."""
        match = self._ANCHOR_RE.search(cell_html)
        return match.group(1) if match else cell_html

    # ------------------------------------------------------------------
    # Team / opponent tables (regex based)
    # ------------------------------------------------------------------
    def parse(self, html):
        """Parse the team and the opponent table out of a league page.

        :param html: downloaded page text
        :return: ``(team_heads, team_datas, opponent_heads, opponent_datas)``
        """
        team_heads, team_datas = self.get_team_info(html)
        opponent_heads, opponent_datas = self.get_opponent_info(html)
        return team_heads, team_datas, opponent_heads, opponent_datas

    def get_team_info(self, html):
        """Extract the header and the rows of the team table.

        :param html: downloaded page text
        :return: ``(team_heads, team_datas)`` -- list of header cells and a
            generator of rows
        :raises ValueError: if the table or its header cannot be found
        """
        team_table = self._find_table(html, self._TEAM_TABLE_ID)
        team_heads = self._parse_heads(team_table)
        team_datas = self.get_datas(team_table)
        return team_heads, team_datas

    def get_opponent_info(self, html):
        """Extract the header and the rows of the opponent table.

        :param html: downloaded page text
        :return: ``(opponent_heads, opponent_datas)`` -- list of header cells
            and a generator of rows
        :raises ValueError: if the table or its header cannot be found
        """
        opponent_table = self._find_table(html, self._OPPONENT_TABLE_ID)
        opponent_heads = self._parse_heads(opponent_table)
        opponent_datas = self.get_datas(opponent_table)
        return opponent_heads, opponent_datas

    def get_datas(self, table_html):
        """Yield one list per body row of a team / opponent table.

        Each row is ``[rank, team, stat, ...]``: the first ``<th>`` of the row
        followed by its ``<td>`` cells, with the first ``<td>`` reduced to the
        text of its link when it has one.

        :param table_html: inner HTML of the table
        :return: generator of row lists
        :raises ValueError: on first iteration if the table has no ``<tbody>``
        """
        for head_cells, data_cells in self._body_rows(table_html):
            # Rows lacking either kind of cell carry no data; skip them
            # rather than fail on an index lookup.
            if not head_cells or not data_cells:
                continue
            data_cells[0] = self._anchor_text(data_cells[0])
            yield [head_cells[0]] + data_cells

    # ------------------------------------------------------------------
    # Schedule pages (regex based)
    # ------------------------------------------------------------------
    def get_schedule_datas(self, table_html):
        """Yield one list per body row of a schedule table.

        The first ``<th>`` of the row is followed by its ``<td>`` cells. The
        cells at positions 1, 3 and 5 (and the leading ``<th>``) are reduced
        to the text of their link when they contain one.

        :param table_html: inner HTML of the table
        :return: generator of row lists
        :raises ValueError: on first iteration if the table has no ``<tbody>``
        """
        for head_cells, data_cells in self._body_rows(table_html):
            # Rows made only of <th> cells (or empty rows) hold no game data.
            if not head_cells or not data_cells:
                continue
            for index in (1, 3, 5):
                if index < len(data_cells):
                    data_cells[index] = self._anchor_text(data_cells[index])
            yield [self._anchor_text(head_cells[0])] + data_cells

    def parse_schedule_info(self, html):
        """Extract the header and the rows of the schedule table.

        :param html: downloaded page text
        :return: ``(heads, datas)`` -- list of header cells and a generator
            of rows
        :raises ValueError: if the table or its header cannot be found
        """
        table = self._find_table(html, self._SCHEDULE_TABLE_ID)
        heads = self._parse_heads(table)
        datas = self.get_schedule_datas(table)
        return heads, datas

    # ------------------------------------------------------------------
    # Advanced team table (XPath based)
    # ------------------------------------------------------------------
    def get_advanced_team_datas(self, table):
        """Yield one list per body row of the advanced-team table.

        Cells are read one ``<td>`` at a time so that a cell without a direct
        text node keeps its column position; selecting ``./td/text()`` for
        the whole row would silently drop such cells and shift the rest.

        :param table: selector for the ``<table>`` element
        :return: generator of ``[rank, team, stat, ...]`` lists
        """
        for tr in table.xpath('./tbody/tr'):
            cells = tr.xpath('./td[@data-stat!="DUMMY"]')
            if not cells:
                continue
            row = [tr.xpath('./th/text()').get()]
            for index, cell in enumerate(cells):
                # The first cell is the team: prefer its link text.
                text = cell.xpath('./a/text()').get() if index == 0 else None
                if text is None:
                    text = cell.xpath('string(.)').get()
                row.append(text)
            yield row

    def parse_advanced_team(self, html):
        """Extract the header and the rows of the advanced-team table.

        :param html: downloaded page text
        :return: ``(heads, table_data)`` -- list of header cells and a
            generator of rows
        """
        selector = Selector(text=html)
        table = selector.xpath('//table[@id="advanced-team"]')
        # The second header row is the one read for column names.
        res = table.xpath('./thead/tr')[1].xpath('./th/text()').getall()
        # Headers holding a non-breaking space are placeholders; dropping
        # them keeps the header aligned with rows that exclude DUMMY cells.
        heads = [head for head in res if '\xa0' not in head]
        table_data = self.get_advanced_team_datas(table)
        return heads, table_data

    # ------------------------------------------------------------------
    # Output
    # ------------------------------------------------------------------
    def save_csv(self, title, heads, rows):
        """Write ``heads`` and ``rows`` to ``<title>.csv`` encoded as UTF-8.

        :param title: file name (or path) without the ``.csv`` extension
        :param heads: header cells
        :param rows: iterable of rows; may be a generator
        """
        # ``with`` closes the file even if iterating ``rows`` raises.
        with open(title + '.csv', mode='w', encoding='utf-8', newline='') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(heads)
            csv_writer.writerows(rows)

    # ------------------------------------------------------------------
    # Crawl entry points
    # ------------------------------------------------------------------
    def crawl_team_opponent(self):
        """Download the league page and save ``team.csv`` and ``opponent.csv``."""
        res = self.send(self.url)
        team_heads, team_datas, opponent_heads, opponent_datas = self.parse(res)
        self.save_csv("team", team_heads, team_datas)
        self.save_csv("opponent", opponent_heads, opponent_datas)

    def crawl_schedule(self):
        """Download every monthly schedule page and save ``schedule_<month>.csv``."""
        months = ["october", "november", "december", "january", "february", "march", "april", "may", "june"]
        for month in months:
            html = self.send(self.schedule_url.format(month))
            heads, datas = self.parse_schedule_info(html)
            self.save_csv("schedule_" + month, heads, datas)

    def crawl_advanced_team(self):
        """Download the advanced-team page and save ``advanced_team.csv``."""
        res = self.send(self.advanced_team_url)
        heads, datas = self.parse_advanced_team(res)
        self.save_csv("advanced_team", heads, datas)

    def crawl(self):
        """Run the enabled crawl steps; only the advanced-team step is active."""
        # 1. Team / opponent tables (disabled)
        # self.crawl_team_opponent()
        # 2. Schedule tables (disabled)
        # self.crawl_schedule()
        # 3. Advanced team table
        self.crawl_advanced_team()

if __name__ == '__main__':
    # Run the spider when the file is executed as a script.
    spider = NBASpider()
    spider.crawl()

标签: