Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library? - Python code example

Last modified: 2022-03-11 14:45:39.832000    Author: Mango

Code example 4
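This error is BeautifulSoup's FeatureNotFound message: a library somewhere in the stack asked BeautifulSoup for the lxml tree builder, but the lxml package is not installed in the environment. Installing it, typically with pip install lxml, resolves the message. The scraper below, built on Selenium, extruct, and pandas, assumes lxml is available.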
import pandas as pd
import extruct as ex
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

urls = [
    'https://www.oddsshark.com/nfl/odds',
    'https://www.oddsshark.com/nba/odds'
]

def get_driver():
    """Return a headless Chrome webdriver."""
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    return driver

def get_source(driver, url):
    """Fetch url with Selenium and return the rendered page source."""
    driver.get(url)
    return driver.page_source

def get_json(source):
    """Extract JSON-LD metadata from the page source using extruct."""
    return ex.extract(source, syntaxes=['json-ld'])
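
# For reference, ex.extract() returns a dict keyed by syntax. A sketch of
# the shape expected by the code below (field values are illustrative, and
# the exact nesting depends on the site's markup):
#
# {'json-ld': [{'@context': 'https://schema.org',
#               '@type': 'SportsEvent',
#               'awayTeam': {'@type': 'SportsTeam', 'name': 'Away Team'},
#               'homeTeam': {'@type': 'SportsTeam', 'name': 'Home Team'},
#               'location': {'@type': 'Place', 'name': 'Stadium'},
#               'startDate': '2022-01-01T18:00:00Z'}]}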

def get_next_page(driver, source):
    """In the event the teams are on more than one page, parse the page and
    return the URL for the next page of results.

    :param driver: Selenium webdriver
    :param source: Page source code from Selenium (unused; kept so the
        calling code still works unchanged)

    :return
        URL of the next paginated page, or '' if there is none
    """

    # find_elements_by_xpath() was removed in Selenium 4; use the By API,
    # and reuse the matched element instead of running the same xpath twice.
    elements = driver.find_elements(By.XPATH, '//link[@rel="next"]')
    if elements:
        return elements[0].get_attribute('href')
    return ''
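
# The xpath above matches pagination markup of the form
#   <link rel="next" href="https://www.oddsshark.com/nfl/odds?page=2">
# (the href value here is illustrative).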


df = pd.DataFrame(columns=['awayTeam', 'homeTeam', 'location', 'startDate'])

def save_teams(data, df):
    """Scrape the teams from a schema.org JSON-LD tag and save the contents
    in the df Pandas dataframe.

    :param data: JSON-LD source containing schema.org SportsEvent markup
    :param df: Pandas dataframe to which to append each SportsEvent

    :return
        df with teams appended
    """

    for item in data['json-ld']:
        # extruct exposes each JSON-LD block as a dict whose type lives
        # under the '@type' key, so the original check `"SportsEvent" in item`
        # (a dict-key lookup) never matched anything. This assumes the common
        # flat layout; markup nested under '@graph' would need unwrapping.
        if item.get('@type') != 'SportsEvent':
            continue

        row = {
            'awayTeam': item.get('awayTeam', {}).get('name'),
            'homeTeam': item.get('homeTeam', {}).get('name'),
            'location': item.get('location', {}).get('name'),
            'startDate': item.get('startDate')
        }
        print(row)
        # DataFrame.append() was removed in pandas 2.0; concatenate a
        # one-row frame instead
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

    return df
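
# Quick self-test of save_teams() with a hand-built payload (all values
# below are made up for illustration):
#
# sample = {'json-ld': [{'@type': 'SportsEvent',
#                        'awayTeam': {'name': 'Jets'},
#                        'homeTeam': {'name': 'Giants'},
#                        'location': {'name': 'MetLife Stadium'},
#                        'startDate': '2022-01-01T18:00:00Z'}]}
# df = save_teams(sample, df)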


for url in urls:

    print(url)

    # Save the teams from the first page
    driver = get_driver()
    source = get_source(driver, url)
    data = get_json(source)  # renamed from `json` to avoid shadowing the stdlib module name
    df = save_teams(data, df)

    # Get teams on each paginated page if other pages exist. Appending to
    # the list while iterating over it lets the loop pick up each newly
    # discovered page; the '' sentinel from get_next_page() ends the walk.
    next_page = get_next_page(driver, source)
    driver.quit()
    paginated_urls = [next_page]

    for url in paginated_urls:

        if url:
            driver = get_driver()
            source = get_source(driver, url)
            data = get_json(source)
            df = save_teams(data, df)
            next_page = get_next_page(driver, source)
            driver.quit()
            paginated_urls.append(next_page)
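
Once the loops finish, df holds one row per SportsEvent found. A minimal follow-up sketch for inspecting and persisting the results (the output filename is an assumption):

print(df.head())
df.to_csv('sports_events.csv', index=False)  # hypothetical output path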