为已能正常运行的网络爬虫添加 for 循环(Python 和 BeautifulSoup)- pandas技术脚本

Adding a for loop to a working web scraper (Python and Beautifulsoup)

我对 for 循环有一个疑问：如何把它添加到一个已经能正常工作的网络抓取工具中，使其可以依次处理一个网页列表。我需要的可能只是两三行简单的代码。

我很感激这个问题之前可能已经被问过很多次并得到了回答，但我一直在努力让一些代码为我工作很长一段时间了。我对 Python 比较陌生，希望有所改进。

背景信息：

我已经使用 Python 和 Beautifulsoup 编写了一个网络抓取工具，它能够成功地从 TransferMarkt.com 获取网页并抓取所有需要的网络链接。该脚本由两部分组成：

1. 在第一部分中，我获取某个足球联赛(例如英超联赛)的网页，提取排名表中各支球队的网页链接，并将它们放入一个列表中。
2. 在脚本的第二部分中，我使用这个球队链接列表，进一步提取每支球队中每名球员的信息，然后将它们合并成一个包含所有球员信息的 pandas DataFrame。

我的问题是关于如何在这个网络爬虫的第一部分添加一个 for 循环，不仅可以从一个联赛网页中提取球队链接，还可以从联赛网页列表中提取链接。

下面我包含了一个足球联赛网页的示例、我的网络爬虫代码和输出。

示例：

要抓取的示例网页(英超联赛 - 代码 GB1)：https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/gb1/plus/?saison_id=2019

代码(第 1 部分，共 2 部分)- 从联赛网页上抓取各个球队的链接：

# Python libraries



## Data Preprocessing

import pandas as pd



## Data scraping libraries

from bs4 import BeautifulSoup

import requests





# Assign league by code, e.g. Premier League = 'GB1', to the list_league_selected variable
list_league_selected = 'GB1'

# Assign season by year to season variable e.g. 2014/15 season = 2014
season = '2019'

# Web scraper script

## Process League Table
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

## Build the league URL from the selected league code and season.
## (Concatenating the builtin function `id` here raised a TypeError.)
page = ('https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/'
        + list_league_selected + '/plus/?saison_id=' + season)
tree = requests.get(page, headers=headers)
soup = BeautifulSoup(tree.content, 'html.parser')

## Extract all links with the correct CSS selector
links = soup.select("a.vereinprofil_tooltip")

## Keep the href of every third anchor starting at index 1 (positions 1, 4,
## 7, ... up to 58) and prefix the site root so each link is absolute and can
## be requested later. Slicing, unlike indexing with range(1, 59, 3), does not
## raise IndexError when the page holds fewer anchors than expected.
team_links = ["https://www.transfermarkt.co.uk" + link.get("href")
              for link in links[1:59:3]]

# View list of team weblinks assigned to variable - team_links
print(team_links)

# Sample output (abbreviated):
# ['https://www.transfermarkt.co.uk/manchester-city/startseite/verein/281/saison_id/2019',
#  'https://www.transfermarkt.co.uk/fc-liverpool/startseite/verein/31/saison_id/2019',
#  'https://www.transfermarkt.co.uk/tottenham-hotspur/startseite/verein/148/saison_id/2019',
#  'https://www.transfermarkt.co.uk/fc-chelsea/startseite/verein/631/saison_id/2019',
#  ...,
#  'https://www.transfermarkt.co.uk/sheffield-united/startseite/verein/350/saison_id/2019']

# Column names shared by every per-team DataFrame (loop-invariant, so defined once)
df_headers = ['position_number', 'position_description', 'name', 'dob', 'nationality', 'value']

# Collect one DataFrame per team and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
team_frames = []

# Run the scraper through each of the links in the team_links list
for page in team_links:

    # Download and process the team page
    pageTree = requests.get(page, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'lxml')

    # Extract all data (one list per output column)
    position_number = [item.text for item in pageSoup.select('.items .rn_nummer')]
    position_description = [item.text for item in pageSoup.select('.items td:not([class])')]
    name = [item.text for item in pageSoup.select('.hide-for-small .spielprofil_tooltip')]
    dob = [item.text for item in pageSoup.select('.zentriert:nth-of-type(4):not([id])')]
    # A cell may hold several titled elements; their titles are joined with '/'
    nationality = ['/'.join([i['title'] for i in item.select('[title]')]) for item in pageSoup.select('.zentriert:nth-of-type(5):not([id])')]
    value = [item.text for item in pageSoup.select('.rechts.hauptlink')]

    # zip() truncates to the shortest column, so each row is a complete record
    df_temp = pd.DataFrame(list(zip(position_number, position_description, name, dob, nationality, value)), columns=df_headers)
    team_frames.append(df_temp)

# Master DataFrame; pd.concat raises ValueError on an empty list, so fall back
# to an empty DataFrame when no team page was processed.
df = pd.concat(team_frames) if team_frames else pd.DataFrame()

# View the pandas DataFrame
print(df)

list_all_leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']  # codes for the top 5 European leagues

import requests

from bs4 import BeautifulSoup

import pandas as pd





headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}

# League codes substituted into the URL template passed to main()
leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']


def _collect_team_links(session, url):
    """Return the absolute team-page URLs found on every league page in `leagues`."""
    from urllib.parse import urljoin

    links = []
    for lea in leagues:
        print(f"Fetching Links from {lea}")
        r = session.get(url.format(lea), headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        for cell in soup.find_all(
                "td", class_="hauptlink no-border-links hide-for-small hide-for-pad"):
            anchor = cell.find("a", href=True)
            if anchor is not None:
                # urljoin resolves the site-relative href against the template's
                # scheme and host, replacing the magic slice url[:31].
                links.append(urljoin(url, anchor["href"]))
    return links


def _parse_squad(soup):
    """Return a DataFrame of the players on a team page, or None if it has no squad table."""
    target = soup.find("table", class_="items")
    if target is None:
        return None

    name_cells = target.select("td.hide")
    pn = [cell.text for cell in target.select("div.rn_nummer")]
    pos = [cell.text for cell in target.find_all("td", class_=False)]
    name = [cell.text for cell in name_cells]
    # The date of birth is read from the first <td> following each name cell
    dob = [cell.find_next("td").text for cell in name_cells]
    # Nationalities are the non-empty "alt" values of the children of the
    # second <td> following each athlete cell, joined with " /"
    nat = [" /".join([a.get("alt") for a in cell.find_all_next("td")[1] if a.get("alt")])
           for cell in target.find_all("td", itemprop="athlete")]
    val = [cell.get_text(strip=True) for cell in target.select('td.rechts.hauptlink')]

    # zip() truncates to the shortest column, so each row is a complete record
    return pd.DataFrame(list(zip(pn, pos, name, dob, nat, val)), columns=[
        'position_number', 'position_description', 'name', 'dob', 'nationality', 'value'])


def main(url):
    """Scrape player data for every team of every league in `leagues` into data.csv.

    Args:
        url: League-page URL template with one ``{}`` placeholder that is
            filled with each league code.

    All requests share a single requests.Session. Team pages without a squad
    table are reported and skipped; if nothing was collected, no file is written.
    """
    with requests.Session() as req:
        links = _collect_team_links(req, url)
        print(f"Collected {len(links)} Links")

        goals = []
        for num, link in enumerate(links, start=1):
            print(f"Extracting Page# {num}")
            r = req.get(link, headers=headers)
            squad = _parse_squad(BeautifulSoup(r.content, 'html.parser'))
            if squad is None:
                print(f"No squad table found at {link}, skipping")
                continue
            goals.append(squad)

    # pd.concat raises ValueError on an empty list
    if not goals:
        print("No data collected; data.csv was not written")
        return

    new = pd.concat(goals)
    new.to_csv("data.csv", index=False)


main("https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/{}/plus/?saison_id=2019")

输出：

从示例网页中提取的链接(示例网页共 20 个链接，仅显示 4 个)：

# Python libraries



## Data Preprocessing

import pandas as pd



## Data scraping libraries

from bs4 import BeautifulSoup

import requests





# Assign league by code, e.g. Premier League = 'GB1', to the list_league_selected variable
list_league_selected = 'GB1'

# Assign season by year to season variable e.g. 2014/15 season = 2014
season = '2019'

# Web scraper script

## Process League Table
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

## Build the league URL from the selected league code and season.
## (Concatenating the builtin function `id` here raised a TypeError.)
page = ('https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/'
        + list_league_selected + '/plus/?saison_id=' + season)
tree = requests.get(page, headers=headers)
soup = BeautifulSoup(tree.content, 'html.parser')

## Extract all links with the correct CSS selector
links = soup.select("a.vereinprofil_tooltip")

## Keep the href of every third anchor starting at index 1 (positions 1, 4,
## 7, ... up to 58) and prefix the site root so each link is absolute and can
## be requested later. Slicing, unlike indexing with range(1, 59, 3), does not
## raise IndexError when the page holds fewer anchors than expected.
team_links = ["https://www.transfermarkt.co.uk" + link.get("href")
              for link in links[1:59:3]]

# View list of team weblinks assigned to variable - team_links
print(team_links)

# Sample output (abbreviated):
# ['https://www.transfermarkt.co.uk/manchester-city/startseite/verein/281/saison_id/2019',
#  'https://www.transfermarkt.co.uk/fc-liverpool/startseite/verein/31/saison_id/2019',
#  'https://www.transfermarkt.co.uk/tottenham-hotspur/startseite/verein/148/saison_id/2019',
#  'https://www.transfermarkt.co.uk/fc-chelsea/startseite/verein/631/saison_id/2019',
#  ...,
#  'https://www.transfermarkt.co.uk/sheffield-united/startseite/verein/350/saison_id/2019']

# Column names shared by every per-team DataFrame (loop-invariant, so defined once)
df_headers = ['position_number', 'position_description', 'name', 'dob', 'nationality', 'value']

# Collect one DataFrame per team and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
team_frames = []

# Run the scraper through each of the links in the team_links list
for page in team_links:

    # Download and process the team page
    pageTree = requests.get(page, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'lxml')

    # Extract all data (one list per output column)
    position_number = [item.text for item in pageSoup.select('.items .rn_nummer')]
    position_description = [item.text for item in pageSoup.select('.items td:not([class])')]
    name = [item.text for item in pageSoup.select('.hide-for-small .spielprofil_tooltip')]
    dob = [item.text for item in pageSoup.select('.zentriert:nth-of-type(4):not([id])')]
    # A cell may hold several titled elements; their titles are joined with '/'
    nationality = ['/'.join([i['title'] for i in item.select('[title]')]) for item in pageSoup.select('.zentriert:nth-of-type(5):not([id])')]
    value = [item.text for item in pageSoup.select('.rechts.hauptlink')]

    # zip() truncates to the shortest column, so each row is a complete record
    df_temp = pd.DataFrame(list(zip(position_number, position_description, name, dob, nationality, value)), columns=df_headers)
    team_frames.append(df_temp)

# Master DataFrame; pd.concat raises ValueError on an empty list, so fall back
# to an empty DataFrame when no team page was processed.
df = pd.concat(team_frames) if team_frames else pd.DataFrame()

# View the pandas DataFrame
print(df)

list_all_leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']  # codes for the top 5 European leagues

import requests

from bs4 import BeautifulSoup

import pandas as pd





headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}

# League codes substituted into the URL template passed to main()
leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']


def _collect_team_links(session, url):
    """Return the absolute team-page URLs found on every league page in `leagues`."""
    from urllib.parse import urljoin

    links = []
    for lea in leagues:
        print(f"Fetching Links from {lea}")
        r = session.get(url.format(lea), headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        for cell in soup.find_all(
                "td", class_="hauptlink no-border-links hide-for-small hide-for-pad"):
            anchor = cell.find("a", href=True)
            if anchor is not None:
                # urljoin resolves the site-relative href against the template's
                # scheme and host, replacing the magic slice url[:31].
                links.append(urljoin(url, anchor["href"]))
    return links


def _parse_squad(soup):
    """Return a DataFrame of the players on a team page, or None if it has no squad table."""
    target = soup.find("table", class_="items")
    if target is None:
        return None

    name_cells = target.select("td.hide")
    pn = [cell.text for cell in target.select("div.rn_nummer")]
    pos = [cell.text for cell in target.find_all("td", class_=False)]
    name = [cell.text for cell in name_cells]
    # The date of birth is read from the first <td> following each name cell
    dob = [cell.find_next("td").text for cell in name_cells]
    # Nationalities are the non-empty "alt" values of the children of the
    # second <td> following each athlete cell, joined with " /"
    nat = [" /".join([a.get("alt") for a in cell.find_all_next("td")[1] if a.get("alt")])
           for cell in target.find_all("td", itemprop="athlete")]
    val = [cell.get_text(strip=True) for cell in target.select('td.rechts.hauptlink')]

    # zip() truncates to the shortest column, so each row is a complete record
    return pd.DataFrame(list(zip(pn, pos, name, dob, nat, val)), columns=[
        'position_number', 'position_description', 'name', 'dob', 'nationality', 'value'])


def main(url):
    """Scrape player data for every team of every league in `leagues` into data.csv.

    Args:
        url: League-page URL template with one ``{}`` placeholder that is
            filled with each league code.

    All requests share a single requests.Session. Team pages without a squad
    table are reported and skipped; if nothing was collected, no file is written.
    """
    with requests.Session() as req:
        links = _collect_team_links(req, url)
        print(f"Collected {len(links)} Links")

        goals = []
        for num, link in enumerate(links, start=1):
            print(f"Extracting Page# {num}")
            r = req.get(link, headers=headers)
            squad = _parse_squad(BeautifulSoup(r.content, 'html.parser'))
            if squad is None:
                print(f"No squad table found at {link}, skipping")
                continue
            goals.append(squad)

    # pd.concat raises ValueError on an empty list
    if not goals:
        print("No data collected; data.csv was not written")
        return

    new = pd.concat(goals)
    new.to_csv("data.csv", index=False)


main("https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/{}/plus/?saison_id=2019")

有了这个球队链接列表 team_links，我就可以用下面的代码进一步提取每支球队所有球员的信息，并根据输出创建一个包含所有球员信息的 pandas DataFrame：

代码(第 2 部分，共 2 部分)- 使用 team_links 列表抓取个人球员信息：

# Python libraries



## Data Preprocessing

import pandas as pd



## Data scraping libraries

from bs4 import BeautifulSoup

import requests





# Assign league by code, e.g. Premier League = 'GB1', to the list_league_selected variable
list_league_selected = 'GB1'

# Assign season by year to season variable e.g. 2014/15 season = 2014
season = '2019'

# Web scraper script

## Process League Table
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

## Build the league URL from the selected league code and season.
## (Concatenating the builtin function `id` here raised a TypeError.)
page = ('https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/'
        + list_league_selected + '/plus/?saison_id=' + season)
tree = requests.get(page, headers=headers)
soup = BeautifulSoup(tree.content, 'html.parser')

## Extract all links with the correct CSS selector
links = soup.select("a.vereinprofil_tooltip")

## Keep the href of every third anchor starting at index 1 (positions 1, 4,
## 7, ... up to 58) and prefix the site root so each link is absolute and can
## be requested later. Slicing, unlike indexing with range(1, 59, 3), does not
## raise IndexError when the page holds fewer anchors than expected.
team_links = ["https://www.transfermarkt.co.uk" + link.get("href")
              for link in links[1:59:3]]

# View list of team weblinks assigned to variable - team_links
print(team_links)

# Sample output (abbreviated):
# ['https://www.transfermarkt.co.uk/manchester-city/startseite/verein/281/saison_id/2019',
#  'https://www.transfermarkt.co.uk/fc-liverpool/startseite/verein/31/saison_id/2019',
#  'https://www.transfermarkt.co.uk/tottenham-hotspur/startseite/verein/148/saison_id/2019',
#  'https://www.transfermarkt.co.uk/fc-chelsea/startseite/verein/631/saison_id/2019',
#  ...,
#  'https://www.transfermarkt.co.uk/sheffield-united/startseite/verein/350/saison_id/2019']

# Column names shared by every per-team DataFrame (loop-invariant, so defined once)
df_headers = ['position_number', 'position_description', 'name', 'dob', 'nationality', 'value']

# Collect one DataFrame per team and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
team_frames = []

# Run the scraper through each of the links in the team_links list
for page in team_links:

    # Download and process the team page
    pageTree = requests.get(page, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'lxml')

    # Extract all data (one list per output column)
    position_number = [item.text for item in pageSoup.select('.items .rn_nummer')]
    position_description = [item.text for item in pageSoup.select('.items td:not([class])')]
    name = [item.text for item in pageSoup.select('.hide-for-small .spielprofil_tooltip')]
    dob = [item.text for item in pageSoup.select('.zentriert:nth-of-type(4):not([id])')]
    # A cell may hold several titled elements; their titles are joined with '/'
    nationality = ['/'.join([i['title'] for i in item.select('[title]')]) for item in pageSoup.select('.zentriert:nth-of-type(5):not([id])')]
    value = [item.text for item in pageSoup.select('.rechts.hauptlink')]

    # zip() truncates to the shortest column, so each row is a complete record
    df_temp = pd.DataFrame(list(zip(position_number, position_description, name, dob, nationality, value)), columns=df_headers)
    team_frames.append(df_temp)

# Master DataFrame; pd.concat raises ValueError on an empty list, so fall back
# to an empty DataFrame when no team page was processed.
df = pd.concat(team_frames) if team_frames else pd.DataFrame()

# View the pandas DataFrame
print(df)

list_all_leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']  # codes for the top 5 European leagues

import requests

from bs4 import BeautifulSoup

import pandas as pd





headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}

# League codes substituted into the URL template passed to main()
leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']


def _collect_team_links(session, url):
    """Return the absolute team-page URLs found on every league page in `leagues`."""
    from urllib.parse import urljoin

    links = []
    for lea in leagues:
        print(f"Fetching Links from {lea}")
        r = session.get(url.format(lea), headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        for cell in soup.find_all(
                "td", class_="hauptlink no-border-links hide-for-small hide-for-pad"):
            anchor = cell.find("a", href=True)
            if anchor is not None:
                # urljoin resolves the site-relative href against the template's
                # scheme and host, replacing the magic slice url[:31].
                links.append(urljoin(url, anchor["href"]))
    return links


def _parse_squad(soup):
    """Return a DataFrame of the players on a team page, or None if it has no squad table."""
    target = soup.find("table", class_="items")
    if target is None:
        return None

    name_cells = target.select("td.hide")
    pn = [cell.text for cell in target.select("div.rn_nummer")]
    pos = [cell.text for cell in target.find_all("td", class_=False)]
    name = [cell.text for cell in name_cells]
    # The date of birth is read from the first <td> following each name cell
    dob = [cell.find_next("td").text for cell in name_cells]
    # Nationalities are the non-empty "alt" values of the children of the
    # second <td> following each athlete cell, joined with " /"
    nat = [" /".join([a.get("alt") for a in cell.find_all_next("td")[1] if a.get("alt")])
           for cell in target.find_all("td", itemprop="athlete")]
    val = [cell.get_text(strip=True) for cell in target.select('td.rechts.hauptlink')]

    # zip() truncates to the shortest column, so each row is a complete record
    return pd.DataFrame(list(zip(pn, pos, name, dob, nat, val)), columns=[
        'position_number', 'position_description', 'name', 'dob', 'nationality', 'value'])


def main(url):
    """Scrape player data for every team of every league in `leagues` into data.csv.

    Args:
        url: League-page URL template with one ``{}`` placeholder that is
            filled with each league code.

    All requests share a single requests.Session. Team pages without a squad
    table are reported and skipped; if nothing was collected, no file is written.
    """
    with requests.Session() as req:
        links = _collect_team_links(req, url)
        print(f"Collected {len(links)} Links")

        goals = []
        for num, link in enumerate(links, start=1):
            print(f"Extracting Page# {num}")
            r = req.get(link, headers=headers)
            squad = _parse_squad(BeautifulSoup(r.content, 'html.parser'))
            if squad is None:
                print(f"No squad table found at {link}, skipping")
                continue
            goals.append(squad)

    # pd.concat raises ValueError on an empty list
    if not goals:
        print("No data collected; data.csv was not written")
        return

    new = pd.concat(goals)
    new.to_csv("data.csv", index=False)


main("https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/{}/plus/?saison_id=2019")

我的问题 - 添加一个 for 循环来遍历所有联赛：

我需要做的是：不再把单个联赛代码赋给代码第一部分中的 list_league_selected 变量，而是用 for 循环遍历完整的联赛代码列表 list_all_leagues。该联赛代码列表如下：

# Python libraries



## Data Preprocessing

import pandas as pd



## Data scraping libraries

from bs4 import BeautifulSoup

import requests





# Assign league by code, e.g. Premier League = 'GB1', to the list_league_selected variable
list_league_selected = 'GB1'

# Assign season by year to season variable e.g. 2014/15 season = 2014
season = '2019'

# Web scraper script

## Process League Table
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

## Build the league URL from the selected league code and season.
## (Concatenating the builtin function `id` here raised a TypeError.)
page = ('https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/'
        + list_league_selected + '/plus/?saison_id=' + season)
tree = requests.get(page, headers=headers)
soup = BeautifulSoup(tree.content, 'html.parser')

## Extract all links with the correct CSS selector
links = soup.select("a.vereinprofil_tooltip")

## Keep the href of every third anchor starting at index 1 (positions 1, 4,
## 7, ... up to 58) and prefix the site root so each link is absolute and can
## be requested later. Slicing, unlike indexing with range(1, 59, 3), does not
## raise IndexError when the page holds fewer anchors than expected.
team_links = ["https://www.transfermarkt.co.uk" + link.get("href")
              for link in links[1:59:3]]

# View list of team weblinks assigned to variable - team_links
print(team_links)

# Sample output (abbreviated):
# ['https://www.transfermarkt.co.uk/manchester-city/startseite/verein/281/saison_id/2019',
#  'https://www.transfermarkt.co.uk/fc-liverpool/startseite/verein/31/saison_id/2019',
#  'https://www.transfermarkt.co.uk/tottenham-hotspur/startseite/verein/148/saison_id/2019',
#  'https://www.transfermarkt.co.uk/fc-chelsea/startseite/verein/631/saison_id/2019',
#  ...,
#  'https://www.transfermarkt.co.uk/sheffield-united/startseite/verein/350/saison_id/2019']

# Column names shared by every per-team DataFrame (loop-invariant, so defined once)
df_headers = ['position_number', 'position_description', 'name', 'dob', 'nationality', 'value']

# Collect one DataFrame per team and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
team_frames = []

# Run the scraper through each of the links in the team_links list
for page in team_links:

    # Download and process the team page
    pageTree = requests.get(page, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'lxml')

    # Extract all data (one list per output column)
    position_number = [item.text for item in pageSoup.select('.items .rn_nummer')]
    position_description = [item.text for item in pageSoup.select('.items td:not([class])')]
    name = [item.text for item in pageSoup.select('.hide-for-small .spielprofil_tooltip')]
    dob = [item.text for item in pageSoup.select('.zentriert:nth-of-type(4):not([id])')]
    # A cell may hold several titled elements; their titles are joined with '/'
    nationality = ['/'.join([i['title'] for i in item.select('[title]')]) for item in pageSoup.select('.zentriert:nth-of-type(5):not([id])')]
    value = [item.text for item in pageSoup.select('.rechts.hauptlink')]

    # zip() truncates to the shortest column, so each row is a complete record
    df_temp = pd.DataFrame(list(zip(position_number, position_description, name, dob, nationality, value)), columns=df_headers)
    team_frames.append(df_temp)

# Master DataFrame; pd.concat raises ValueError on an empty list, so fall back
# to an empty DataFrame when no team page was processed.
df = pd.concat(team_frames) if team_frames else pd.DataFrame()

# View the pandas DataFrame
print(df)

list_all_leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']  # codes for the top 5 European leagues

import requests

from bs4 import BeautifulSoup

import pandas as pd





headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}

# League codes substituted into the URL template passed to main()
leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']


def _collect_team_links(session, url):
    """Return the absolute team-page URLs found on every league page in `leagues`."""
    from urllib.parse import urljoin

    links = []
    for lea in leagues:
        print(f"Fetching Links from {lea}")
        r = session.get(url.format(lea), headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        for cell in soup.find_all(
                "td", class_="hauptlink no-border-links hide-for-small hide-for-pad"):
            anchor = cell.find("a", href=True)
            if anchor is not None:
                # urljoin resolves the site-relative href against the template's
                # scheme and host, replacing the magic slice url[:31].
                links.append(urljoin(url, anchor["href"]))
    return links


def _parse_squad(soup):
    """Return a DataFrame of the players on a team page, or None if it has no squad table."""
    target = soup.find("table", class_="items")
    if target is None:
        return None

    name_cells = target.select("td.hide")
    pn = [cell.text for cell in target.select("div.rn_nummer")]
    pos = [cell.text for cell in target.find_all("td", class_=False)]
    name = [cell.text for cell in name_cells]
    # The date of birth is read from the first <td> following each name cell
    dob = [cell.find_next("td").text for cell in name_cells]
    # Nationalities are the non-empty "alt" values of the children of the
    # second <td> following each athlete cell, joined with " /"
    nat = [" /".join([a.get("alt") for a in cell.find_all_next("td")[1] if a.get("alt")])
           for cell in target.find_all("td", itemprop="athlete")]
    val = [cell.get_text(strip=True) for cell in target.select('td.rechts.hauptlink')]

    # zip() truncates to the shortest column, so each row is a complete record
    return pd.DataFrame(list(zip(pn, pos, name, dob, nat, val)), columns=[
        'position_number', 'position_description', 'name', 'dob', 'nationality', 'value'])


def main(url):
    """Scrape player data for every team of every league in `leagues` into data.csv.

    Args:
        url: League-page URL template with one ``{}`` placeholder that is
            filled with each league code.

    All requests share a single requests.Session. Team pages without a squad
    table are reported and skipped; if nothing was collected, no file is written.
    """
    with requests.Session() as req:
        links = _collect_team_links(req, url)
        print(f"Collected {len(links)} Links")

        goals = []
        for num, link in enumerate(links, start=1):
            print(f"Extracting Page# {num}")
            r = req.get(link, headers=headers)
            squad = _parse_squad(BeautifulSoup(r.content, 'html.parser'))
            if squad is None:
                print(f"No squad table found at {link}, skipping")
                continue
            goals.append(squad)

    # pd.concat raises ValueError on an empty list
    if not goals:
        print("No data collected; data.csv was not written")
        return

    new = pd.concat(goals)
    new.to_csv("data.csv", index=False)


main("https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/{}/plus/?saison_id=2019")

我已经阅读了几个解决方案，但我很难让循环正常工作并将团队网页的完整列表附加到正确的部分。我相信我现在真的接近完成我的爬虫了，任何关于如何创建这个 for 循环的建议都将不胜感激！

提前感谢您的帮助！

实际上，我花了一些时间修正了您代码中的许多错误，并大幅简化了流程。您可以用下面的代码实现您的目标。

I chose to stay "under antibiotic protection", by which I mean working under requests.Session(): the same session is maintained throughout my loop, which helps prevent TCP-layer security from blocking, refusing, or dropping my requests while scraping.

# Python libraries



## Data Preprocessing

import pandas as pd



## Data scraping libraries

from bs4 import BeautifulSoup

import requests





# Assign league by code, e.g. Premier League = 'GB1', to the list_league_selected variable
list_league_selected = 'GB1'

# Assign season by year to season variable e.g. 2014/15 season = 2014
season = '2019'

# Web scraper script

## Process League Table
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

## Build the league URL from the selected league code and season.
## (Concatenating the builtin function `id` here raised a TypeError.)
page = ('https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/'
        + list_league_selected + '/plus/?saison_id=' + season)
tree = requests.get(page, headers=headers)
soup = BeautifulSoup(tree.content, 'html.parser')

## Extract all links with the correct CSS selector
links = soup.select("a.vereinprofil_tooltip")

## Keep the href of every third anchor starting at index 1 (positions 1, 4,
## 7, ... up to 58) and prefix the site root so each link is absolute and can
## be requested later. Slicing, unlike indexing with range(1, 59, 3), does not
## raise IndexError when the page holds fewer anchors than expected.
team_links = ["https://www.transfermarkt.co.uk" + link.get("href")
              for link in links[1:59:3]]

# View list of team weblinks assigned to variable - team_links
print(team_links)

# Sample output (abbreviated):
# ['https://www.transfermarkt.co.uk/manchester-city/startseite/verein/281/saison_id/2019',
#  'https://www.transfermarkt.co.uk/fc-liverpool/startseite/verein/31/saison_id/2019',
#  'https://www.transfermarkt.co.uk/tottenham-hotspur/startseite/verein/148/saison_id/2019',
#  'https://www.transfermarkt.co.uk/fc-chelsea/startseite/verein/631/saison_id/2019',
#  ...,
#  'https://www.transfermarkt.co.uk/sheffield-united/startseite/verein/350/saison_id/2019']

# Column names shared by every per-team DataFrame (loop-invariant, so defined once)
df_headers = ['position_number', 'position_description', 'name', 'dob', 'nationality', 'value']

# Collect one DataFrame per team and concatenate once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
team_frames = []

# Run the scraper through each of the links in the team_links list
for page in team_links:

    # Download and process the team page
    pageTree = requests.get(page, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'lxml')

    # Extract all data (one list per output column)
    position_number = [item.text for item in pageSoup.select('.items .rn_nummer')]
    position_description = [item.text for item in pageSoup.select('.items td:not([class])')]
    name = [item.text for item in pageSoup.select('.hide-for-small .spielprofil_tooltip')]
    dob = [item.text for item in pageSoup.select('.zentriert:nth-of-type(4):not([id])')]
    # A cell may hold several titled elements; their titles are joined with '/'
    nationality = ['/'.join([i['title'] for i in item.select('[title]')]) for item in pageSoup.select('.zentriert:nth-of-type(5):not([id])')]
    value = [item.text for item in pageSoup.select('.rechts.hauptlink')]

    # zip() truncates to the shortest column, so each row is a complete record
    df_temp = pd.DataFrame(list(zip(position_number, position_description, name, dob, nationality, value)), columns=df_headers)
    team_frames.append(df_temp)

# Master DataFrame; pd.concat raises ValueError on an empty list, so fall back
# to an empty DataFrame when no team page was processed.
df = pd.concat(team_frames) if team_frames else pd.DataFrame()

# View the pandas DataFrame
print(df)

list_all_leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']  # codes for the top 5 European leagues

import requests

from bs4 import BeautifulSoup

import pandas as pd





headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}

# League codes substituted into the URL template passed to main()
leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']


def _collect_team_links(session, url):
    """Return the absolute team-page URLs found on every league page in `leagues`."""
    from urllib.parse import urljoin

    links = []
    for lea in leagues:
        print(f"Fetching Links from {lea}")
        r = session.get(url.format(lea), headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        for cell in soup.find_all(
                "td", class_="hauptlink no-border-links hide-for-small hide-for-pad"):
            anchor = cell.find("a", href=True)
            if anchor is not None:
                # urljoin resolves the site-relative href against the template's
                # scheme and host, replacing the magic slice url[:31].
                links.append(urljoin(url, anchor["href"]))
    return links


def _parse_squad(soup):
    """Return a DataFrame of the players on a team page, or None if it has no squad table."""
    target = soup.find("table", class_="items")
    if target is None:
        return None

    name_cells = target.select("td.hide")
    pn = [cell.text for cell in target.select("div.rn_nummer")]
    pos = [cell.text for cell in target.find_all("td", class_=False)]
    name = [cell.text for cell in name_cells]
    # The date of birth is read from the first <td> following each name cell
    dob = [cell.find_next("td").text for cell in name_cells]
    # Nationalities are the non-empty "alt" values of the children of the
    # second <td> following each athlete cell, joined with " /"
    nat = [" /".join([a.get("alt") for a in cell.find_all_next("td")[1] if a.get("alt")])
           for cell in target.find_all("td", itemprop="athlete")]
    val = [cell.get_text(strip=True) for cell in target.select('td.rechts.hauptlink')]

    # zip() truncates to the shortest column, so each row is a complete record
    return pd.DataFrame(list(zip(pn, pos, name, dob, nat, val)), columns=[
        'position_number', 'position_description', 'name', 'dob', 'nationality', 'value'])


def main(url):
    """Scrape player data for every team of every league in `leagues` into data.csv.

    Args:
        url: League-page URL template with one ``{}`` placeholder that is
            filled with each league code.

    All requests share a single requests.Session. Team pages without a squad
    table are reported and skipped; if nothing was collected, no file is written.
    """
    with requests.Session() as req:
        links = _collect_team_links(req, url)
        print(f"Collected {len(links)} Links")

        goals = []
        for num, link in enumerate(links, start=1):
            print(f"Extracting Page# {num}")
            r = req.get(link, headers=headers)
            squad = _parse_squad(BeautifulSoup(r.content, 'html.parser'))
            if squad is None:
                print(f"No squad table found at {link}, skipping")
                continue
            goals.append(squad)

    # pd.concat raises ValueError on an empty list
    if not goals:
        print("No data collected; data.csv was not written")
        return

    new = pd.concat(goals)
    new.to_csv("data.csv", index=False)


main("https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/{}/plus/?saison_id=2019")

输出：在线查看

关于 pandas：将 for 循环添加到工作网络爬虫(Python 和 Beautifulsoup)

将 for 循环添加到工作网络爬虫(Python 和 Beautifulsoup)

Adding a for loop to a working web scraper (Python and Beautifulsoup)

相关推荐

Spring部署设置openshift

检查Java中正则表达式中模式的第n次出现

如何让 JTable 停留在已编辑的单元格上

Weblogic 12c 部署

Resteasy Content-Type 默认值

代码不会停止运行，在 Java 中

Out of memory java heap space

Log4j 记录到共享日志文件