import pandas as pd
import extruct as ex
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
urls = [
    'https://www.oddsshark.com/nfl/odds',
    'https://www.oddsshark.com/nba/odds',
]
def get_driver():
    """Return a headless Chrome webdriver."""
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    return driver
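
# Note: webdriver.Chrome() assumes a compatible chromedriver is available;
# recent Selenium releases (4.6+) can fetch one automatically via Selenium
# Manager, otherwise it needs to be on PATH.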
def get_source(driver, url):
    """Fetch a URL and return the rendered page source."""
    driver.get(url)
    return driver.page_source
def get_json(source):
    """Extract JSON-LD metadata from the page source."""
    return ex.extract(source, syntaxes=['json-ld'])
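
# For reference, ex.extract() returns a dict keyed by syntax, so with
# syntaxes=['json-ld'] the result looks like
# {'json-ld': [<one dict per JSON-LD script block on the page>]};
# the keys inside each block depend on the site's schema.org markup.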
def get_next_page(driver, source):
    """In the event teams are on more than one page, parse the page source
    and return the URL for the next page of results.

    :param driver: Selenium webdriver
    :param source: Page source code from Selenium
    :return: URL of the next paginated page, or '' if there is none
    """
    # find_element(s)_by_xpath was removed in Selenium 4; use By.XPATH,
    # and reuse the first match instead of querying the DOM twice
    elements = driver.find_elements(By.XPATH, '//link[@rel="next"]')
    if elements:
        return elements[0].get_attribute('href')
    return ''
df = pd.DataFrame(columns=['awayTeam', 'homeTeam', 'location', 'startDate'])
def save_teams(data, df):
    """Scrape the teams from a schema.org JSON-LD tag and save the contents
    in the df Pandas dataframe.

    :param data: JSON-LD source containing schema.org SportsEvent markup
    :param df: Pandas dataframe to which to append each SportsEvent
    :return: df with teams appended
    """
    for item in data['json-ld']:
        # JSON-LD stores the schema.org type under the '@type' key, so the
        # original check `"SportsEvent" in item` (a key lookup) never
        # matched; test the '@type' value instead.
        if item.get('@type') == 'SportsEvent':
            row = {
                'awayTeam': item.get('awayTeam', {}).get('name'),
                'homeTeam': item.get('homeTeam', {}).get('name'),
                'location': item.get('location', {}).get('name'),
                'startDate': item.get('startDate'),
            }
            # DataFrame.append was removed in pandas 2.0; use pd.concat
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    return df
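
# For illustration, a matching JSON-LD item is assumed to look roughly like:
# {'@type': 'SportsEvent',
#  'awayTeam': {'name': '...'}, 'homeTeam': {'name': '...'},
#  'location': {'name': '...'}, 'startDate': '2022-01-23T18:30'}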
for url in urls:
    print(url)

    # Save the teams from the first page
    driver = get_driver()
    source = get_source(driver, url)
    metadata = get_json(source)  # renamed from `json` to avoid shadowing the stdlib module
    df = save_teams(metadata, df)

    # Get the teams on each paginated page, if other pages exist
    next_page = get_next_page(driver, source)
    driver.quit()
    paginated_urls = [next_page]

    # Appending inside the loop extends the list being iterated, so newly
    # discovered pages are crawled too; the empty-string check ends the walk.
    for page_url in paginated_urls:
        if page_url:
            driver = get_driver()
            source = get_source(driver, page_url)
            metadata = get_json(source)
            df = save_teams(metadata, df)
            next_page = get_next_page(driver, source)
            driver.quit()
            paginated_urls.append(next_page)
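
# Finally, print and persist the collected events; the CSV filename here is
# illustrative, not part of the original script.
print(df)
df.to_csv('sports_events.csv', index=False)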