使用Python Selenium模块下载 Instagram 帖子
在本文中,我们将学习如何使用Python Selenium模块下载个人资料的 Instagram 帖子。
要求:
- 谷歌浏览器或火狐浏览器
- Chrome 驱动程序(适用于 Google Chrome)或 Gecko 驱动程序(适用于 Mozilla Firefox)
- Selenium包:它是通过程序控制 Web 浏览器的强大工具。它适用于所有浏览器,适用于所有主要操作系统,其脚本是用各种语言编写的,例如Python、 Java、C# 等。可以使用以下命令安装:
pip install selenium
- Beautiful Soup 包:它是一个Python库,用于从 HTML 和 XML 文件中提取数据。它与您最喜欢的解析器一起工作,提供导航、搜索和修改解析树的惯用方式。它可以使用以下命令安装:
pip install bs4
- Requests 包:Requests 是一个 Python 第三方库,用于向指定的 URL 发出 HTTP 请求。可以使用以下命令安装它:
pip install requests
循序渐进的方法:
步骤1:导入模块并输入登录信息以及页面的URL。
Python3
# import required modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.common.exceptions
import time
from bs4 import BeautifulSoup as bs
import requests
import os
# get instagram account credentials
# getpass reads the password without echoing it to the terminal
from getpass import getpass

username = input('Enter Your User Name ')
password = getpass('Enter Your Password ')

# assign URL of the profile whose posts will be downloaded;
# strip() drops accidental surrounding whitespace from the typed name
url = 'https://instagram.com/' + \
    input('Enter User Name Of User For Downloading Posts ').strip()
Python3
# start the browser session
def path():
    """Start a new Chrome session and store it in the global ``chrome``.

    The other functions of this script drive the browser through that
    module-level name. Pass the driver location to ``webdriver.Chrome``
    here if your installation requires it.
    """
    global chrome
    chrome = webdriver.Chrome()
Python3
# open the profile page
def url_name(url):
    """Load *url* in the browser and pause before returning.

    Args:
        url: Address of the page to open.
    """
    chrome.get(url)
    # Plain fixed delay (not a webdriver wait): gives the page time to load
    # so the element look-ups that follow are less likely to raise
    # NoSuchElementException.
    time.sleep(4)
Python3
# login to access post
def login(username, your_password):
    """Log in to Instagram through the login form of the open page.

    Args:
        username: Account name typed into the ``username`` field.
        your_password: Password typed into the ``password`` field.
    """
    # "class name" and "name" are the locator strategy strings taken by
    # find_element (the values of By.CLASS_NAME / By.NAME); the
    # find_element_by_* helpers no longer exist in Selenium 4.3+.
    log_but = chrome.find_element("class name", "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)
    # fill in the username box
    chrome.find_element("name", "username").send_keys(username)
    # fill in the password box and submit the form with the Enter key
    passw = chrome.find_element("name", "password")
    passw.send_keys(your_password)
    passw.send_keys(Keys.RETURN)
    time.sleep(5.5)
    # Dismiss the "Not Now" button; it may not be shown, in which case
    # there is nothing to dismiss and the run should simply continue.
    try:
        chrome.find_element("class name", "yWX7d").click()
    except selenium.common.exceptions.NoSuchElementException:
        pass
    time.sleep(3)
Python3
# function to open the first post
def first_post():
    """Click the first element with class ``kIKUG`` to open the first post."""
    # click() returns None, so its result is not kept in a variable
    chrome.find_element("class name", "kIKUG").click()
    time.sleep(2)
Python3
def download_allposts():
    """Download every post of the profile named by the global ``url``.

    Opens the first post, then keeps clicking the "next post" arrow until
    ``next_post()`` reports there is none. Media is written into a folder
    named after the last path segment of ``url``.
    """
    # open First Post
    first_post()
    # rstrip so a trailing slash in the URL does not yield an empty name
    user_name = url.rstrip('/').split('/')[-1]
    # create the folder only if it does not exist yet
    os.makedirs(user_name, exist_ok=True)
    _save_post(user_name, 1)
    c = 2
    while True:
        next_el = next_post()
        if not next_el:
            break
        next_el.click()
        time.sleep(1.3)
        try:
            _save_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return
        c += 1


def _save_post(user_name, index):
    """Save the media of the currently open post.

    A single-media post is written to ``<user_name>/content<index>``; each
    item of a nested post is written to ``<user_name>/content<index>.<n>``.
    """
    base = user_name + '/' + 'content' + str(index)
    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', base)
        return
    count_img = 0
    while nescheck:
        elem_img = chrome.find_element("class name", "rQDP3")
        save_multiple(base + '.' + str(count_img), elem_img)
        count_img += 1
        # move to the next item; the loop ends once no chevron is found
        nescheck.click()
        nescheck = nested_check()
    # the final item is taken from the end of the list (last_img_flag)
    save_multiple(base + '.' + str(count_img), elem_img, last_img_flag=1)
Python3
# function to get next post
def next_post():
    """Return the "next post" arrow element, or ``False`` when none exists.

    ``False`` compares equal to ``0``, so callers testing the result
    against either value (or by truthiness) behave the same.
    """
    try:
        return chrome.find_element(
            "class name", "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        return False
Python3
# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the video or image of the open post to *img_name*.

    Args:
        class_name: Class name of the element wrapping the post media.
        img_name: Path of the file to write.
    """
    time.sleep(0.5)
    try:
        pic = chrome.find_element("class name", class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return
    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')
    # a <video> is preferred; fall back to the <img> when there is none
    media = soup.find('video') or soup.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        # no media tag or no src attribute: nothing to download
        print("No downloadable media found for " + img_name)
        return
    # the timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not store an HTTP error body as if it were the media file
        print("Could not download " + img_name)
        return
    with open(img_name, 'wb') as f:
        f.write(response.content)
    time.sleep(0.9)
Python3
# Function to save one item of a nested (multi-media) post
def save_multiple(img_name, elem, last_img_flag=False):
    """Download one media item of a nested post to *img_name*.

    Args:
        img_name: Path of the file to write.
        elem: Element whose inner HTML contains the ``<ul>`` of items.
        last_img_flag: When truthy the last ``<li>`` is used, otherwise
            the middle one.
    """
    time.sleep(1)
    soup = bs(elem.get_attribute('innerHTML'), 'html.parser')
    biglist = soup.find('ul')
    list_images = biglist.find_all('li') if biglist is not None else []
    if not list_images:
        # no <ul>/<li> structure: nothing to pick an item from
        print("No media list found for " + img_name)
        return
    if last_img_flag:
        user_image = list_images[-1]
    else:
        user_image = list_images[len(list_images) // 2]
    # a <video> is preferred; fall back to the <img> when there is none
    media = user_image.find('video') or user_image.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return
    # the timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not store an HTTP error body as if it were the media file
        print("Could not download " + img_name)
        return
    with open(img_name, 'wb') as f:
        f.write(response.content)
Python3
# function to check if the post is nested
def nested_check():
    """Return the "next item" chevron of the open post, or ``False``.

    Callers use the result both as a truth value (is there another item?)
    and as the element to click.
    """
    time.sleep(1)
    try:
        # a class-name locator takes a single class token, so the name is
        # given without surrounding whitespace
        return chrome.find_element("class name", "coreSpriteRightChevron")
    except selenium.common.exceptions.NoSuchElementException:
        return False
Python3
# Driver Code
path()
try:
    time.sleep(1)
    url_name(url)
    login(username, password)
    download_allposts()
finally:
    # quit() ends the whole WebDriver session, and the finally clause runs
    # it even when one of the steps above raises.
    chrome.quit()
Python3
# import required modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.common.exceptions
import time
from bs4 import BeautifulSoup as bs
import requests
import os
# get instagram account credentials
# getpass reads the password without echoing it to the terminal
from getpass import getpass

username = input('Enter Your User Name ')
password = getpass('Enter Your Password ')

# assign URL of the profile whose posts will be downloaded;
# strip() drops accidental surrounding whitespace from the typed name
url = 'https://instagram.com/' + \
    input('Enter User Name Of User For Downloading Posts ').strip()
# Start the browser session
def path():
    """Start a new Chrome session and store it in the global ``chrome``.

    The other functions of this script drive the browser through that
    module-level name. Pass the driver location to ``webdriver.Chrome``
    here if your installation requires it.
    """
    global chrome
    chrome = webdriver.Chrome()
# Open the profile page
def url_name(url):
    """Load *url* in the browser and pause before returning.

    Args:
        url: Address of the page to open.
    """
    chrome.get(url)
    # Plain fixed delay (not a webdriver wait): gives the page time to load
    # so the element look-ups that follow are less likely to raise
    # NoSuchElementException.
    time.sleep(4)
# Login to access post
def login(username, your_password):
    """Log in to Instagram through the login form of the open page.

    Args:
        username: Account name typed into the ``username`` field.
        your_password: Password typed into the ``password`` field.
    """
    # "class name" and "name" are the locator strategy strings taken by
    # find_element (the values of By.CLASS_NAME / By.NAME); the
    # find_element_by_* helpers no longer exist in Selenium 4.3+.
    log_but = chrome.find_element("class name", "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)
    # fill in the username box
    chrome.find_element("name", "username").send_keys(username)
    # fill in the password box and submit the form with the Enter key
    passw = chrome.find_element("name", "password")
    passw.send_keys(your_password)
    passw.send_keys(Keys.RETURN)
    time.sleep(5.5)
    # Dismiss the "Not Now" button; it may not be shown, in which case
    # there is nothing to dismiss and the run should simply continue.
    try:
        chrome.find_element("class name", "yWX7d").click()
    except selenium.common.exceptions.NoSuchElementException:
        pass
    time.sleep(3)
# Function to open the first post
def first_post():
    """Click the first element with class ``kIKUG`` to open the first post."""
    # click() returns None, so its result is not kept in a variable
    chrome.find_element("class name", "kIKUG").click()
    time.sleep(2)
# Function to get next post
def next_post():
    """Return the "next post" arrow element, or ``False`` when none exists.

    ``False`` compares equal to ``0``, so callers testing the result
    against either value (or by truthiness) behave the same.
    """
    try:
        return chrome.find_element(
            "class name", "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        return False
# Download content of all posts
def download_allposts():
    """Download every post of the profile named by the global ``url``.

    Opens the first post, then keeps clicking the "next post" arrow until
    ``next_post()`` reports there is none. Media is written into a folder
    named after the last path segment of ``url``.
    """
    # open First Post
    first_post()
    # rstrip so a trailing slash in the URL does not yield an empty name
    user_name = url.rstrip('/').split('/')[-1]
    # create the folder only if it does not exist yet
    os.makedirs(user_name, exist_ok=True)
    _save_post(user_name, 1)
    c = 2
    while True:
        next_el = next_post()
        if not next_el:
            break
        next_el.click()
        time.sleep(1.3)
        try:
            _save_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return
        c += 1


def _save_post(user_name, index):
    """Save the media of the currently open post.

    A single-media post is written to ``<user_name>/content<index>``; each
    item of a nested post is written to ``<user_name>/content<index>.<n>``.
    """
    base = user_name + '/' + 'content' + str(index)
    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', base)
        return
    count_img = 0
    while nescheck:
        elem_img = chrome.find_element("class name", "rQDP3")
        save_multiple(base + '.' + str(count_img), elem_img)
        count_img += 1
        # move to the next item; the loop ends once no chevron is found
        nescheck.click()
        nescheck = nested_check()
    # the final item is taken from the end of the list (last_img_flag)
    save_multiple(base + '.' + str(count_img), elem_img, last_img_flag=1)
# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the video or image of the open post to *img_name*.

    Args:
        class_name: Class name of the element wrapping the post media.
        img_name: Path of the file to write.
    """
    time.sleep(0.5)
    try:
        pic = chrome.find_element("class name", class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return
    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')
    # a <video> is preferred; fall back to the <img> when there is none
    media = soup.find('video') or soup.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        # no media tag or no src attribute: nothing to download
        print("No downloadable media found for " + img_name)
        return
    # the timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not store an HTTP error body as if it were the media file
        print("Could not download " + img_name)
        return
    with open(img_name, 'wb') as f:
        f.write(response.content)
    time.sleep(0.9)
# Function to save one item of a nested (multi-media) post
def save_multiple(img_name, elem, last_img_flag=False):
    """Download one media item of a nested post to *img_name*.

    Args:
        img_name: Path of the file to write.
        elem: Element whose inner HTML contains the ``<ul>`` of items.
        last_img_flag: When truthy the last ``<li>`` is used, otherwise
            the middle one.
    """
    time.sleep(1)
    soup = bs(elem.get_attribute('innerHTML'), 'html.parser')
    biglist = soup.find('ul')
    list_images = biglist.find_all('li') if biglist is not None else []
    if not list_images:
        # no <ul>/<li> structure: nothing to pick an item from
        print("No media list found for " + img_name)
        return
    if last_img_flag:
        user_image = list_images[-1]
    else:
        user_image = list_images[len(list_images) // 2]
    # a <video> is preferred; fall back to the <img> when there is none
    media = user_image.find('video') or user_image.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return
    # the timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not store an HTTP error body as if it were the media file
        print("Could not download " + img_name)
        return
    with open(img_name, 'wb') as f:
        f.write(response.content)
# Function to check if the post is nested
def nested_check():
    """Return the "next item" chevron of the open post, or ``False``.

    Callers use the result both as a truth value (is there another item?)
    and as the element to click.
    """
    time.sleep(1)
    try:
        # a class-name locator takes a single class token, so the name is
        # given without surrounding whitespace
        return chrome.find_element("class name", "coreSpriteRightChevron")
    except selenium.common.exceptions.NoSuchElementException:
        return False
# Driver Code
path()
try:
    time.sleep(1)
    url_name(url)
    login(username, password)
    download_allposts()
finally:
    # quit() ends the whole WebDriver session, and the finally clause runs
    # it even when one of the steps above raises.
    chrome.quit()
第 2 步:启动浏览器新会话的函数。根据您的安装情况,您可能需要在 webdriver.Chrome() 函数中添加 Web 驱动程序的路径。
Python3
# start the browser session
def path():
    """Start a new Chrome session and store it in the global ``chrome``.

    The other functions of this script drive the browser through that
    module-level name. Pass the driver location to ``webdriver.Chrome``
    here if your installation requires it.
    """
    global chrome
    chrome = webdriver.Chrome()
步骤3:输入页面URL的函数。
Python3
# open the profile page
def url_name(url):
    """Load *url* in the browser and pause before returning.

    Args:
        url: Address of the page to open.
    """
    chrome.get(url)
    # Plain fixed delay (not a webdriver wait): gives the page time to load
    # so the element look-ups that follow are less likely to raise
    # NoSuchElementException.
    time.sleep(4)
第 4 步:输入您的登录信息的函数。
Python3
# login to access post
def login(username, your_password):
    """Log in to Instagram through the login form of the open page.

    Args:
        username: Account name typed into the ``username`` field.
        your_password: Password typed into the ``password`` field.
    """
    # "class name" and "name" are the locator strategy strings taken by
    # find_element (the values of By.CLASS_NAME / By.NAME); the
    # find_element_by_* helpers no longer exist in Selenium 4.3+.
    log_but = chrome.find_element("class name", "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)
    # fill in the username box
    chrome.find_element("name", "username").send_keys(username)
    # fill in the password box and submit the form with the Enter key
    passw = chrome.find_element("name", "password")
    passw.send_keys(your_password)
    passw.send_keys(Keys.RETURN)
    time.sleep(5.5)
    # Dismiss the "Not Now" button; it may not be shown, in which case
    # there is nothing to dismiss and the run should simply continue.
    try:
        chrome.find_element("class name", "yWX7d").click()
    except selenium.common.exceptions.NoSuchElementException:
        pass
    time.sleep(3)
第 5 步:打开第一篇帖子的函数。
Python3
# function to open the first post
def first_post():
    """Click the first element with class ``kIKUG`` to open the first post."""
    # click() returns None, so its result is not kept in a variable
    chrome.find_element("class name", "kIKUG").click()
    time.sleep(2)
步骤6:下载所有帖子的函数。
Python3
def download_allposts():
    """Download every post of the profile named by the global ``url``.

    Opens the first post, then keeps clicking the "next post" arrow until
    ``next_post()`` reports there is none. Media is written into a folder
    named after the last path segment of ``url``.
    """
    # open First Post
    first_post()
    # rstrip so a trailing slash in the URL does not yield an empty name
    user_name = url.rstrip('/').split('/')[-1]
    # create the folder only if it does not exist yet
    os.makedirs(user_name, exist_ok=True)
    _save_post(user_name, 1)
    c = 2
    while True:
        next_el = next_post()
        if not next_el:
            break
        next_el.click()
        time.sleep(1.3)
        try:
            _save_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return
        c += 1


def _save_post(user_name, index):
    """Save the media of the currently open post.

    A single-media post is written to ``<user_name>/content<index>``; each
    item of a nested post is written to ``<user_name>/content<index>.<n>``.
    """
    base = user_name + '/' + 'content' + str(index)
    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', base)
        return
    count_img = 0
    while nescheck:
        elem_img = chrome.find_element("class name", "rQDP3")
        save_multiple(base + '.' + str(count_img), elem_img)
        count_img += 1
        # move to the next item; the loop ends once no chevron is found
        nescheck.click()
        nescheck = nested_check()
    # the final item is taken from the end of the list (last_img_flag)
    save_multiple(base + '.' + str(count_img), elem_img, last_img_flag=1)
第 7 步:点击下一篇帖子的函数。
Python3
# function to get next post
def next_post():
    """Return the "next post" arrow element, or ``False`` when none exists.

    ``False`` compares equal to ``0``, so callers testing the result
    against either value (or by truthiness) behave the same.
    """
    try:
        return chrome.find_element(
            "class name", "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        return False
步骤8:保存普通帖子的函数。
Python3
# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the video or image of the open post to *img_name*.

    Args:
        class_name: Class name of the element wrapping the post media.
        img_name: Path of the file to write.
    """
    time.sleep(0.5)
    try:
        pic = chrome.find_element("class name", class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return
    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')
    # a <video> is preferred; fall back to the <img> when there is none
    media = soup.find('video') or soup.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        # no media tag or no src attribute: nothing to download
        print("No downloadable media found for " + img_name)
        return
    # the timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not store an HTTP error body as if it were the media file
        print("Could not download " + img_name)
        return
    with open(img_name, 'wb') as f:
        f.write(response.content)
    time.sleep(0.9)
第 9 步:保存嵌套帖子的函数。
Python3
# Function to save one item of a nested (multi-media) post
def save_multiple(img_name, elem, last_img_flag=False):
    """Download one media item of a nested post to *img_name*.

    Args:
        img_name: Path of the file to write.
        elem: Element whose inner HTML contains the ``<ul>`` of items.
        last_img_flag: When truthy the last ``<li>`` is used, otherwise
            the middle one.
    """
    time.sleep(1)
    soup = bs(elem.get_attribute('innerHTML'), 'html.parser')
    biglist = soup.find('ul')
    list_images = biglist.find_all('li') if biglist is not None else []
    if not list_images:
        # no <ul>/<li> structure: nothing to pick an item from
        print("No media list found for " + img_name)
        return
    if last_img_flag:
        user_image = list_images[-1]
    else:
        user_image = list_images[len(list_images) // 2]
    # a <video> is preferred; fall back to the <img> when there is none
    media = user_image.find('video') or user_image.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return
    # the timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not store an HTTP error body as if it were the media file
        print("Could not download " + img_name)
        return
    with open(img_name, 'wb') as f:
        f.write(response.content)
第 10 步:检查帖子是否嵌套的函数。
Python3
# function to check if the post is nested
def nested_check():
    """Return the "next item" chevron of the open post, or ``False``.

    Callers use the result both as a truth value (is there another item?)
    and as the element to click.
    """
    time.sleep(1)
    try:
        # a class-name locator takes a single class token, so the name is
        # given without surrounding whitespace
        return chrome.find_element("class name", "coreSpriteRightChevron")
    except selenium.common.exceptions.NoSuchElementException:
        return False
第 11 步:在驱动程序代码中调用所需的函数。
Python3
# Driver Code
path()
try:
    time.sleep(1)
    url_name(url)
    login(username, password)
    download_allposts()
finally:
    # quit() ends the whole WebDriver session, and the finally clause runs
    # it even when one of the steps above raises.
    chrome.quit()
以下是基于上述方法的完整程序:
Python3
# import required modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.common.exceptions
import time
from bs4 import BeautifulSoup as bs
import requests
import os
# get instagram account credentials
# getpass reads the password without echoing it to the terminal
from getpass import getpass

username = input('Enter Your User Name ')
password = getpass('Enter Your Password ')

# assign URL of the profile whose posts will be downloaded;
# strip() drops accidental surrounding whitespace from the typed name
url = 'https://instagram.com/' + \
    input('Enter User Name Of User For Downloading Posts ').strip()
# Start the browser session
def path():
    """Start a new Chrome session and store it in the global ``chrome``.

    The other functions of this script drive the browser through that
    module-level name. Pass the driver location to ``webdriver.Chrome``
    here if your installation requires it.
    """
    global chrome
    chrome = webdriver.Chrome()
# Open the profile page
def url_name(url):
    """Load *url* in the browser and pause before returning.

    Args:
        url: Address of the page to open.
    """
    chrome.get(url)
    # Plain fixed delay (not a webdriver wait): gives the page time to load
    # so the element look-ups that follow are less likely to raise
    # NoSuchElementException.
    time.sleep(4)
# Login to access post
def login(username, your_password):
    """Log in to Instagram through the login form of the open page.

    Args:
        username: Account name typed into the ``username`` field.
        your_password: Password typed into the ``password`` field.
    """
    # "class name" and "name" are the locator strategy strings taken by
    # find_element (the values of By.CLASS_NAME / By.NAME); the
    # find_element_by_* helpers no longer exist in Selenium 4.3+.
    log_but = chrome.find_element("class name", "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)
    # fill in the username box
    chrome.find_element("name", "username").send_keys(username)
    # fill in the password box and submit the form with the Enter key
    passw = chrome.find_element("name", "password")
    passw.send_keys(your_password)
    passw.send_keys(Keys.RETURN)
    time.sleep(5.5)
    # Dismiss the "Not Now" button; it may not be shown, in which case
    # there is nothing to dismiss and the run should simply continue.
    try:
        chrome.find_element("class name", "yWX7d").click()
    except selenium.common.exceptions.NoSuchElementException:
        pass
    time.sleep(3)
# Function to open the first post
def first_post():
    """Click the first element with class ``kIKUG`` to open the first post."""
    # click() returns None, so its result is not kept in a variable
    chrome.find_element("class name", "kIKUG").click()
    time.sleep(2)
# Function to get next post
def next_post():
    """Return the "next post" arrow element, or ``False`` when none exists.

    ``False`` compares equal to ``0``, so callers testing the result
    against either value (or by truthiness) behave the same.
    """
    try:
        return chrome.find_element(
            "class name", "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        return False
# Download content of all posts
def download_allposts():
    """Download every post of the profile named by the global ``url``.

    Opens the first post, then keeps clicking the "next post" arrow until
    ``next_post()`` reports there is none. Media is written into a folder
    named after the last path segment of ``url``.
    """
    # open First Post
    first_post()
    # rstrip so a trailing slash in the URL does not yield an empty name
    user_name = url.rstrip('/').split('/')[-1]
    # create the folder only if it does not exist yet
    os.makedirs(user_name, exist_ok=True)
    _save_post(user_name, 1)
    c = 2
    while True:
        next_el = next_post()
        if not next_el:
            break
        next_el.click()
        time.sleep(1.3)
        try:
            _save_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return
        c += 1


def _save_post(user_name, index):
    """Save the media of the currently open post.

    A single-media post is written to ``<user_name>/content<index>``; each
    item of a nested post is written to ``<user_name>/content<index>.<n>``.
    """
    base = user_name + '/' + 'content' + str(index)
    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', base)
        return
    count_img = 0
    while nescheck:
        elem_img = chrome.find_element("class name", "rQDP3")
        save_multiple(base + '.' + str(count_img), elem_img)
        count_img += 1
        # move to the next item; the loop ends once no chevron is found
        nescheck.click()
        nescheck = nested_check()
    # the final item is taken from the end of the list (last_img_flag)
    save_multiple(base + '.' + str(count_img), elem_img, last_img_flag=1)
# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the video or image of the open post to *img_name*.

    Args:
        class_name: Class name of the element wrapping the post media.
        img_name: Path of the file to write.
    """
    time.sleep(0.5)
    try:
        pic = chrome.find_element("class name", class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return
    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')
    # a <video> is preferred; fall back to the <img> when there is none
    media = soup.find('video') or soup.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        # no media tag or no src attribute: nothing to download
        print("No downloadable media found for " + img_name)
        return
    # the timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not store an HTTP error body as if it were the media file
        print("Could not download " + img_name)
        return
    with open(img_name, 'wb') as f:
        f.write(response.content)
    time.sleep(0.9)
# Function to save one item of a nested (multi-media) post
def save_multiple(img_name, elem, last_img_flag=False):
    """Download one media item of a nested post to *img_name*.

    Args:
        img_name: Path of the file to write.
        elem: Element whose inner HTML contains the ``<ul>`` of items.
        last_img_flag: When truthy the last ``<li>`` is used, otherwise
            the middle one.
    """
    time.sleep(1)
    soup = bs(elem.get_attribute('innerHTML'), 'html.parser')
    biglist = soup.find('ul')
    list_images = biglist.find_all('li') if biglist is not None else []
    if not list_images:
        # no <ul>/<li> structure: nothing to pick an item from
        print("No media list found for " + img_name)
        return
    if last_img_flag:
        user_image = list_images[-1]
    else:
        user_image = list_images[len(list_images) // 2]
    # a <video> is preferred; fall back to the <img> when there is none
    media = user_image.find('video') or user_image.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return
    # the timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not store an HTTP error body as if it were the media file
        print("Could not download " + img_name)
        return
    with open(img_name, 'wb') as f:
        f.write(response.content)
# Function to check if the post is nested
def nested_check():
    """Return the "next item" chevron of the open post, or ``False``.

    Callers use the result both as a truth value (is there another item?)
    and as the element to click.
    """
    time.sleep(1)
    try:
        # a class-name locator takes a single class token, so the name is
        # given without surrounding whitespace
        return chrome.find_element("class name", "coreSpriteRightChevron")
    except selenium.common.exceptions.NoSuchElementException:
        return False
# Driver Code
path()
try:
    time.sleep(1)
    url_name(url)
    login(username, password)
    download_allposts()
finally:
    # quit() ends the whole WebDriver session, and the finally clause runs
    # it even when one of the steps above raises.
    chrome.quit()
运行此完整脚本后,将创建一个包含所有帖子的目录。
输出:
注意:如果您是 Windows 用户,那么帖子将以.file扩展名保存,使用可以打开图像和视频的应用程序打开帖子(Instagram 帖子只有媒体、图像或视频类型)