📜  使用Python Selenium模块下载 Instagram 帖子

📅  最后修改于: 2022-05-13 01:55:11.509000             🧑  作者: Mango

使用Python Selenium模块下载 Instagram 帖子

在本文中,我们将学习如何使用Python Selenium模块下载个人资料的 Instagram 帖子。

要求:

  • 谷歌浏览器或火狐浏览器
  • Chrome 驱动程序(适用于 Google Chrome)或 Gecko 驱动程序(适用于 Mozilla Firefox)
  • Selenium包:它是通过程序控制 Web 浏览器的强大工具。它适用于所有浏览器,适用于所有主要操作系统,其脚本是用各种语言编写的,例如Python、 Java、C# 等。可以使用以下命令安装:
pip install selenium 
  • Beautiful Soup 包:它是一个Python库,用于从 HTML 和 XML 文件中提取数据。它与您最喜欢的解析器一起工作,提供导航、搜索和修改解析树的惯用方式。它可以使用以下命令安装:
pip install bs4
  • Requests 包:Requests 是一个常用的第三方Python库,用于向指定的 URL 发出 HTTP 请求。可以使用以下命令安装它:
pip install requests

循序渐进的方法:

步骤1:导入模块并输入登录信息以及页面的URL。

Python3
# import required modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.common.exceptions
import time
from bs4 import BeautifulSoup as bs
import requests
import os
 
# get instagram account credentials
# getpass is imported next to its only use; it reads the password without
# echoing it to the terminal.
from getpass import getpass

username = input('Enter Your User Name ')
password = getpass('Enter Your Password ')

# assign URL
# Whitespace and slashes are stripped from the profile name so that the
# last path segment of the URL (used by download_allposts as the folder
# name) is never empty.
url = 'https://instagram.com/' + \
    input('Enter User Name Of User For Downloading Posts ').strip().strip('/')


Python3
# start the browser session shared by the other functions
def path():
    """Launch Chrome and publish the driver as the global ``chrome``."""
    global chrome

    # webdriver.Chrome() is called without arguments; pass the driver
    # location here if your installation requires it
    session = webdriver.Chrome()
    chrome = session


Python3
# open the requested page
def url_name(url):
    """Navigate the global ``chrome`` driver to ``url`` and then pause."""
    # fixed pause in seconds after navigation, so that the elements looked
    # up afterwards have had time to appear
    settle_seconds = 4

    chrome.get(url)
    time.sleep(settle_seconds)


Python3
# login to access post
def login(username, your_password):
    """Log in through the global ``chrome`` driver.

    Clicks the log-in button on the current page, types the credentials
    into the ``username`` and ``password`` fields, submits with Enter and
    clicks the "Not Now" button when it is present.

    Args:
        username: Text typed into the ``username`` field.
        your_password: Text typed into the ``password`` field.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the log-in
            button or one of the two form fields is not on the page.
    """
    # find_element(By...) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4.3; imported locally so the function is
    # self-contained.
    from selenium.webdriver.common.by import By

    log_but = chrome.find_element(By.CLASS_NAME, "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)

    # fill in the username box
    usern = chrome.find_element(By.NAME, "username")
    usern.send_keys(username)

    # fill in the password box and submit with the Enter key
    passw = chrome.find_element(By.NAME, "password")
    passw.send_keys(your_password)
    passw.send_keys(Keys.RETURN)

    time.sleep(5.5)

    # The "Not Now" button may be missing; that alone should not abort
    # the run, so report it and carry on.
    try:
        notn = chrome.find_element(By.CLASS_NAME, "yWX7d")
    except selenium.common.exceptions.NoSuchElementException:
        print("Not Now button not found, continuing")
    else:
        notn.click()
    time.sleep(3)


Python3
# function to get first post
def first_post():
    """Click the first element with class ``kIKUG`` to open the first post.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If no such
            element is on the page.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    # click() returns None, so its result is not stored
    chrome.find_element(By.CLASS_NAME, "kIKUG").click()
    time.sleep(2)


Python3
# helper: save every image/video of the post that is currently open
def _save_current_post(user_name, index):
    """Save the open post as ``content<index>`` inside folder ``user_name``.

    When nested_check() finds a chevron the post is stepped through item
    by item and each item is saved as ``content<index>.<n>``; otherwise
    the single item is saved as ``content<index>``.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    base = os.path.join(user_name, 'content' + str(index))

    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', base)
        return

    count_img = 0
    while nescheck:
        elem_img = chrome.find_element(By.CLASS_NAME, 'rQDP3')
        save_multiple(base + '.' + str(count_img), elem_img)
        count_img += 1
        nescheck.click()
        nescheck = nested_check()

    # no chevron left: save the last list entry as the final item
    save_multiple(base + '.' + str(count_img), elem_img, last_img_flag=True)


def download_allposts():
    """Download every post of the profile in the global ``url``.

    Opens the first post, creates a folder named after the last path
    segment of ``url`` and saves each post into it, clicking the element
    returned by next_post() until it returns a falsy value.
    """
    # open First Post
    first_post()

    # ignore trailing slashes so the folder name is never empty
    user_name = url.rstrip('/').split('/')[-1]

    # create the folder unless it already exists
    os.makedirs(user_name, exist_ok=True)

    _save_current_post(user_name, 1)

    c = 2
    while True:
        next_el = next_post()
        if not next_el:
            break

        next_el.click()
        time.sleep(1.3)

        try:
            _save_current_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return

        c += 1


Python3
# function to get next post
def next_post():
    """Return the ``coreSpriteRightPaginationArrow`` element, or 0 if absent.

    The falsy 0 is kept as the "not found" value because the caller in
    this file compares the result against False.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    try:
        return chrome.find_element(
            By.CLASS_NAME, "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        return 0


Python3
# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the video or image of the current post into ``img_name``.

    Args:
        class_name: Class of the element whose inner HTML holds the media.
        img_name: Path of the file to write.

    Prints a message and returns without writing a file when the element,
    the media URL or the download itself is unavailable.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    time.sleep(0.5)

    try:
        pic = chrome.find_element(By.CLASS_NAME, class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return

    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')

    # a <video> takes precedence over an <img>
    media = soup.find('video')
    if media is None:
        media = soup.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return

    # A timeout keeps a stalled download from hanging the run, and
    # RequestException also covers URLs requests cannot fetch (for
    # example blob: sources).
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)

    time.sleep(0.9)


Python3
# Function to save multiple posts
def save_multiple(img_name, elem, last_img_flag=False):
    """Save one item of a multi-item post into ``img_name``.

    Args:
        img_name: Path of the file to write.
        elem: Element whose inner HTML contains a ``<ul>`` of ``<li>``
            entries, one of which is saved.
        last_img_flag: When true the last ``<li>`` is used; otherwise the
            middle one (index ``len // 2``) is used.

    Prints a message and returns without writing a file when the list,
    the media URL or the download itself is unavailable.
    """
    time.sleep(1)

    markup = bs(elem.get_attribute('innerHTML'), 'html.parser')
    first_list = markup.find('ul')
    list_images = first_list.find_all('li') if first_list is not None else []
    if not list_images:
        print("No list items found for " + img_name)
        return

    if last_img_flag:
        user_image = list_images[-1]
    else:
        user_image = list_images[len(list_images) // 2]

    # a <video> takes precedence over an <img>
    media = user_image.find('video')
    if media is None:
        media = user_image.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return

    # A timeout keeps a stalled download from hanging the run, and
    # RequestException also covers URLs requests cannot fetch (for
    # example blob: sources).
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)


Python3
# function to check if the post is nested
def nested_check():
    """Return the ``coreSpriteRightChevron`` element, or 0 when absent.

    Waits one second before looking. The falsy 0 lets callers use the
    result directly as a loop condition.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    time.sleep(1)
    try:
        # class name written without the stray trailing spaces
        return chrome.find_element(By.CLASS_NAME, 'coreSpriteRightChevron')
    except selenium.common.exceptions.NoSuchElementException:
        return 0


Python3
# Driver Code
path()
try:
    time.sleep(1)
    url_name(url)
    login(username, password)
    download_allposts()
finally:
    # quit() ends the whole WebDriver session instead of only closing the
    # current window; the finally block runs it even when a step raises
    chrome.quit()


Python3
# import required modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.common.exceptions
import time
from bs4 import BeautifulSoup as bs
import requests
import os
 
 
# get instagram account credentials
# getpass is imported next to its only use; it reads the password without
# echoing it to the terminal.
from getpass import getpass

username = input('Enter Your User Name ')
password = getpass('Enter Your Password ')

# assign URL
# Whitespace and slashes are stripped from the profile name so that the
# last path segment of the URL (used by download_allposts as the folder
# name) is never empty.
url = 'https://instagram.com/' + \
    input('Enter User Name Of User For Downloading Posts ').strip().strip('/')
 
# start the browser session shared by the other functions
def path():
    """Launch Chrome and publish the driver as the global ``chrome``."""
    global chrome

    # webdriver.Chrome() is called without arguments; pass the driver
    # location here if your installation requires it
    session = webdriver.Chrome()
    chrome = session
     
# open the requested page
def url_name(url):
    """Navigate the global ``chrome`` driver to ``url`` and then pause."""
    # fixed pause in seconds after navigation, so that the elements looked
    # up afterwards have had time to appear
    settle_seconds = 4

    chrome.get(url)
    time.sleep(settle_seconds)
     
# login to access post
def login(username, your_password):
    """Log in through the global ``chrome`` driver.

    Clicks the log-in button on the current page, types the credentials
    into the ``username`` and ``password`` fields, submits with Enter and
    clicks the "Not Now" button when it is present.

    Args:
        username: Text typed into the ``username`` field.
        your_password: Text typed into the ``password`` field.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the log-in
            button or one of the two form fields is not on the page.
    """
    # find_element(By...) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4.3; imported locally so the function is
    # self-contained.
    from selenium.webdriver.common.by import By

    log_but = chrome.find_element(By.CLASS_NAME, "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)

    # fill in the username box
    usern = chrome.find_element(By.NAME, "username")
    usern.send_keys(username)

    # fill in the password box and submit with the Enter key
    passw = chrome.find_element(By.NAME, "password")
    passw.send_keys(your_password)
    passw.send_keys(Keys.RETURN)

    time.sleep(5.5)

    # The "Not Now" button may be missing; that alone should not abort
    # the run, so report it and carry on.
    try:
        notn = chrome.find_element(By.CLASS_NAME, "yWX7d")
    except selenium.common.exceptions.NoSuchElementException:
        print("Not Now button not found, continuing")
    else:
        notn.click()
    time.sleep(3)
     
# function to get first post
def first_post():
    """Click the first element with class ``kIKUG`` to open the first post.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If no such
            element is on the page.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    # click() returns None, so its result is not stored
    chrome.find_element(By.CLASS_NAME, "kIKUG").click()
    time.sleep(2)
     
# function to get next post
def next_post():
    """Return the ``coreSpriteRightPaginationArrow`` element, or 0 if absent.

    The falsy 0 is kept as the "not found" value because the caller in
    this file compares the result against False.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    try:
        return chrome.find_element(
            By.CLASS_NAME, "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        return 0
       
# helper: save every image/video of the post that is currently open
def _save_current_post(user_name, index):
    """Save the open post as ``content<index>`` inside folder ``user_name``.

    When nested_check() finds a chevron the post is stepped through item
    by item and each item is saved as ``content<index>.<n>``; otherwise
    the single item is saved as ``content<index>``.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    base = os.path.join(user_name, 'content' + str(index))

    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', base)
        return

    count_img = 0
    while nescheck:
        elem_img = chrome.find_element(By.CLASS_NAME, 'rQDP3')
        save_multiple(base + '.' + str(count_img), elem_img)
        count_img += 1
        nescheck.click()
        nescheck = nested_check()

    # no chevron left: save the last list entry as the final item
    save_multiple(base + '.' + str(count_img), elem_img, last_img_flag=True)


# Download content of all posts
def download_allposts():
    """Download every post of the profile in the global ``url``.

    Opens the first post, creates a folder named after the last path
    segment of ``url`` and saves each post into it, clicking the element
    returned by next_post() until it returns a falsy value.
    """
    # open First Post
    first_post()

    # ignore trailing slashes so the folder name is never empty
    user_name = url.rstrip('/').split('/')[-1]

    # create the folder unless it already exists
    os.makedirs(user_name, exist_ok=True)

    _save_current_post(user_name, 1)

    c = 2
    while True:
        next_el = next_post()
        if not next_el:
            break

        next_el.click()
        time.sleep(1.3)

        try:
            _save_current_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return

        c += 1
 
# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the video or image of the current post into ``img_name``.

    Args:
        class_name: Class of the element whose inner HTML holds the media.
        img_name: Path of the file to write.

    Prints a message and returns without writing a file when the element,
    the media URL or the download itself is unavailable.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    time.sleep(0.5)

    try:
        pic = chrome.find_element(By.CLASS_NAME, class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return

    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')

    # a <video> takes precedence over an <img>
    media = soup.find('video')
    if media is None:
        media = soup.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return

    # A timeout keeps a stalled download from hanging the run, and
    # RequestException also covers URLs requests cannot fetch (for
    # example blob: sources).
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)

    time.sleep(0.9)
     
# Function to save multiple posts
def save_multiple(img_name, elem, last_img_flag=False):
    """Save one item of a multi-item post into ``img_name``.

    Args:
        img_name: Path of the file to write.
        elem: Element whose inner HTML contains a ``<ul>`` of ``<li>``
            entries, one of which is saved.
        last_img_flag: When true the last ``<li>`` is used; otherwise the
            middle one (index ``len // 2``) is used.

    Prints a message and returns without writing a file when the list,
    the media URL or the download itself is unavailable.
    """
    time.sleep(1)

    markup = bs(elem.get_attribute('innerHTML'), 'html.parser')
    first_list = markup.find('ul')
    list_images = first_list.find_all('li') if first_list is not None else []
    if not list_images:
        print("No list items found for " + img_name)
        return

    if last_img_flag:
        user_image = list_images[-1]
    else:
        user_image = list_images[len(list_images) // 2]

    # a <video> takes precedence over an <img>
    media = user_image.find('video')
    if media is None:
        media = user_image.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return

    # A timeout keeps a stalled download from hanging the run, and
    # RequestException also covers URLs requests cannot fetch (for
    # example blob: sources).
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)
 
# function to check if the post is nested
def nested_check():
    """Return the ``coreSpriteRightChevron`` element, or 0 when absent.

    Waits one second before looking. The falsy 0 lets callers use the
    result directly as a loop condition.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    time.sleep(1)
    try:
        # class name written without the stray trailing spaces
        return chrome.find_element(By.CLASS_NAME, 'coreSpriteRightChevron')
    except selenium.common.exceptions.NoSuchElementException:
        return 0
 
# Driver Code
path()
try:
    time.sleep(1)
    url_name(url)
    login(username, password)
    download_allposts()
finally:
    # quit() ends the whole WebDriver session instead of only closing the
    # current window; the finally block runs it even when a step raises
    chrome.quit()


第 2 步:启动浏览器新会话的函数。根据您的安装情况,您可能需要把 Web 驱动程序的路径传给 webdriver.Chrome() 函数。

Python3

# start the browser session shared by the other functions
def path():
    """Launch Chrome and publish the driver as the global ``chrome``."""
    global chrome

    # webdriver.Chrome() is called without arguments; pass the driver
    # location here if your installation requires it
    session = webdriver.Chrome()
    chrome = session

步骤3:输入页面URL的函数。

Python3

# open the requested page
def url_name(url):
    """Navigate the global ``chrome`` driver to ``url`` and then pause."""
    # fixed pause in seconds after navigation, so that the elements looked
    # up afterwards have had time to appear
    settle_seconds = 4

    chrome.get(url)
    time.sleep(settle_seconds)

第 4 步:输入您的登录信息的函数。

Python3

# login to access post
def login(username, your_password):
    """Log in through the global ``chrome`` driver.

    Clicks the log-in button on the current page, types the credentials
    into the ``username`` and ``password`` fields, submits with Enter and
    clicks the "Not Now" button when it is present.

    Args:
        username: Text typed into the ``username`` field.
        your_password: Text typed into the ``password`` field.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the log-in
            button or one of the two form fields is not on the page.
    """
    # find_element(By...) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4.3; imported locally so the function is
    # self-contained.
    from selenium.webdriver.common.by import By

    log_but = chrome.find_element(By.CLASS_NAME, "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)

    # fill in the username box
    usern = chrome.find_element(By.NAME, "username")
    usern.send_keys(username)

    # fill in the password box and submit with the Enter key
    passw = chrome.find_element(By.NAME, "password")
    passw.send_keys(your_password)
    passw.send_keys(Keys.RETURN)

    time.sleep(5.5)

    # The "Not Now" button may be missing; that alone should not abort
    # the run, so report it and carry on.
    try:
        notn = chrome.find_element(By.CLASS_NAME, "yWX7d")
    except selenium.common.exceptions.NoSuchElementException:
        print("Not Now button not found, continuing")
    else:
        notn.click()
    time.sleep(3)

第五步:打开第一篇文章的函数。

Python3

# function to get first post
def first_post():
    """Click the first element with class ``kIKUG`` to open the first post.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If no such
            element is on the page.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    # click() returns None, so its result is not stored
    chrome.find_element(By.CLASS_NAME, "kIKUG").click()
    time.sleep(2)

步骤6:下载所有帖子的函数。

Python3

# helper: save every image/video of the post that is currently open
def _save_current_post(user_name, index):
    """Save the open post as ``content<index>`` inside folder ``user_name``.

    When nested_check() finds a chevron the post is stepped through item
    by item and each item is saved as ``content<index>.<n>``; otherwise
    the single item is saved as ``content<index>``.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    base = os.path.join(user_name, 'content' + str(index))

    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', base)
        return

    count_img = 0
    while nescheck:
        elem_img = chrome.find_element(By.CLASS_NAME, 'rQDP3')
        save_multiple(base + '.' + str(count_img), elem_img)
        count_img += 1
        nescheck.click()
        nescheck = nested_check()

    # no chevron left: save the last list entry as the final item
    save_multiple(base + '.' + str(count_img), elem_img, last_img_flag=True)


def download_allposts():
    """Download every post of the profile in the global ``url``.

    Opens the first post, creates a folder named after the last path
    segment of ``url`` and saves each post into it, clicking the element
    returned by next_post() until it returns a falsy value.
    """
    # open First Post
    first_post()

    # ignore trailing slashes so the folder name is never empty
    user_name = url.rstrip('/').split('/')[-1]

    # create the folder unless it already exists
    os.makedirs(user_name, exist_ok=True)

    _save_current_post(user_name, 1)

    c = 2
    while True:
        next_el = next_post()
        if not next_el:
            break

        next_el.click()
        time.sleep(1.3)

        try:
            _save_current_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return

        c += 1

第七步:点击下一篇文章的函数。

Python3

# function to get next post
def next_post():
    """Return the ``coreSpriteRightPaginationArrow`` element, or 0 if absent.

    The falsy 0 is kept as the "not found" value because the caller in
    this file compares the result against False.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    try:
        return chrome.find_element(
            By.CLASS_NAME, "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        return 0

步骤8:保存普通帖子的函数。

Python3

# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the video or image of the current post into ``img_name``.

    Args:
        class_name: Class of the element whose inner HTML holds the media.
        img_name: Path of the file to write.

    Prints a message and returns without writing a file when the element,
    the media URL or the download itself is unavailable.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    time.sleep(0.5)

    try:
        pic = chrome.find_element(By.CLASS_NAME, class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return

    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')

    # a <video> takes precedence over an <img>
    media = soup.find('video')
    if media is None:
        media = soup.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return

    # A timeout keeps a stalled download from hanging the run, and
    # RequestException also covers URLs requests cannot fetch (for
    # example blob: sources).
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)

    time.sleep(0.9)

第 9 步:保存嵌套帖子的函数。

Python3

# Function to save multiple posts
def save_multiple(img_name, elem, last_img_flag=False):
    """Save one item of a multi-item post into ``img_name``.

    Args:
        img_name: Path of the file to write.
        elem: Element whose inner HTML contains a ``<ul>`` of ``<li>``
            entries, one of which is saved.
        last_img_flag: When true the last ``<li>`` is used; otherwise the
            middle one (index ``len // 2``) is used.

    Prints a message and returns without writing a file when the list,
    the media URL or the download itself is unavailable.
    """
    time.sleep(1)

    markup = bs(elem.get_attribute('innerHTML'), 'html.parser')
    first_list = markup.find('ul')
    list_images = first_list.find_all('li') if first_list is not None else []
    if not list_images:
        print("No list items found for " + img_name)
        return

    if last_img_flag:
        user_image = list_images[-1]
    else:
        user_image = list_images[len(list_images) // 2]

    # a <video> takes precedence over an <img>
    media = user_image.find('video')
    if media is None:
        media = user_image.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return

    # A timeout keeps a stalled download from hanging the run, and
    # RequestException also covers URLs requests cannot fetch (for
    # example blob: sources).
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)

第 10 步:检查帖子是否嵌套的函数。

Python3

# function to check if the post is nested
def nested_check():
    """Return the ``coreSpriteRightChevron`` element, or 0 when absent.

    Waits one second before looking. The falsy 0 lets callers use the
    result directly as a loop condition.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    time.sleep(1)
    try:
        # class name written without the stray trailing spaces
        return chrome.find_element(By.CLASS_NAME, 'coreSpriteRightChevron')
    except selenium.common.exceptions.NoSuchElementException:
        return 0

第 11 步:在驱动程序代码中调用所需的函数。

Python3

# Driver Code
path()
try:
    time.sleep(1)
    url_name(url)
    login(username, password)
    download_allposts()
finally:
    # quit() ends the whole WebDriver session instead of only closing the
    # current window; the finally block runs it even when a step raises
    chrome.quit()

以下是基于上述方法的完整程序:

Python3

# import required modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.common.exceptions
import time
from bs4 import BeautifulSoup as bs
import requests
import os
 
 
# get instagram account credentials
# getpass is imported next to its only use; it reads the password without
# echoing it to the terminal.
from getpass import getpass

username = input('Enter Your User Name ')
password = getpass('Enter Your Password ')

# assign URL
# Whitespace and slashes are stripped from the profile name so that the
# last path segment of the URL (used by download_allposts as the folder
# name) is never empty.
url = 'https://instagram.com/' + \
    input('Enter User Name Of User For Downloading Posts ').strip().strip('/')
 
# start the browser session shared by the other functions
def path():
    """Launch Chrome and publish the driver as the global ``chrome``."""
    global chrome

    # webdriver.Chrome() is called without arguments; pass the driver
    # location here if your installation requires it
    session = webdriver.Chrome()
    chrome = session
     
# open the requested page
def url_name(url):
    """Navigate the global ``chrome`` driver to ``url`` and then pause."""
    # fixed pause in seconds after navigation, so that the elements looked
    # up afterwards have had time to appear
    settle_seconds = 4

    chrome.get(url)
    time.sleep(settle_seconds)
     
# login to access post
def login(username, your_password):
    """Log in through the global ``chrome`` driver.

    Clicks the log-in button on the current page, types the credentials
    into the ``username`` and ``password`` fields, submits with Enter and
    clicks the "Not Now" button when it is present.

    Args:
        username: Text typed into the ``username`` field.
        your_password: Text typed into the ``password`` field.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the log-in
            button or one of the two form fields is not on the page.
    """
    # find_element(By...) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4.3; imported locally so the function is
    # self-contained.
    from selenium.webdriver.common.by import By

    log_but = chrome.find_element(By.CLASS_NAME, "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)

    # fill in the username box
    usern = chrome.find_element(By.NAME, "username")
    usern.send_keys(username)

    # fill in the password box and submit with the Enter key
    passw = chrome.find_element(By.NAME, "password")
    passw.send_keys(your_password)
    passw.send_keys(Keys.RETURN)

    time.sleep(5.5)

    # The "Not Now" button may be missing; that alone should not abort
    # the run, so report it and carry on.
    try:
        notn = chrome.find_element(By.CLASS_NAME, "yWX7d")
    except selenium.common.exceptions.NoSuchElementException:
        print("Not Now button not found, continuing")
    else:
        notn.click()
    time.sleep(3)
     
# function to get first post
def first_post():
    """Click the first element with class ``kIKUG`` to open the first post.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If no such
            element is on the page.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    # click() returns None, so its result is not stored
    chrome.find_element(By.CLASS_NAME, "kIKUG").click()
    time.sleep(2)
     
# function to get next post
def next_post():
    """Return the ``coreSpriteRightPaginationArrow`` element, or 0 if absent.

    The falsy 0 is kept as the "not found" value because the caller in
    this file compares the result against False.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    try:
        return chrome.find_element(
            By.CLASS_NAME, "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        return 0
       
# helper: save every image/video of the post that is currently open
def _save_current_post(user_name, index):
    """Save the open post as ``content<index>`` inside folder ``user_name``.

    When nested_check() finds a chevron the post is stepped through item
    by item and each item is saved as ``content<index>.<n>``; otherwise
    the single item is saved as ``content<index>``.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    base = os.path.join(user_name, 'content' + str(index))

    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', base)
        return

    count_img = 0
    while nescheck:
        elem_img = chrome.find_element(By.CLASS_NAME, 'rQDP3')
        save_multiple(base + '.' + str(count_img), elem_img)
        count_img += 1
        nescheck.click()
        nescheck = nested_check()

    # no chevron left: save the last list entry as the final item
    save_multiple(base + '.' + str(count_img), elem_img, last_img_flag=True)


# Download content of all posts
def download_allposts():
    """Download every post of the profile in the global ``url``.

    Opens the first post, creates a folder named after the last path
    segment of ``url`` and saves each post into it, clicking the element
    returned by next_post() until it returns a falsy value.
    """
    # open First Post
    first_post()

    # ignore trailing slashes so the folder name is never empty
    user_name = url.rstrip('/').split('/')[-1]

    # create the folder unless it already exists
    os.makedirs(user_name, exist_ok=True)

    _save_current_post(user_name, 1)

    c = 2
    while True:
        next_el = next_post()
        if not next_el:
            break

        next_el.click()
        time.sleep(1.3)

        try:
            _save_current_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return

        c += 1
 
# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the video or image of the current post into ``img_name``.

    Args:
        class_name: Class of the element whose inner HTML holds the media.
        img_name: Path of the file to write.

    Prints a message and returns without writing a file when the element,
    the media URL or the download itself is unavailable.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    time.sleep(0.5)

    try:
        pic = chrome.find_element(By.CLASS_NAME, class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return

    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')

    # a <video> takes precedence over an <img>
    media = soup.find('video')
    if media is None:
        media = soup.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return

    # A timeout keeps a stalled download from hanging the run, and
    # RequestException also covers URLs requests cannot fetch (for
    # example blob: sources).
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)

    time.sleep(0.9)
     
# Function to save multiple posts
def save_multiple(img_name, elem, last_img_flag=False):
    """Save one item of a multi-item post into ``img_name``.

    Args:
        img_name: Path of the file to write.
        elem: Element whose inner HTML contains a ``<ul>`` of ``<li>``
            entries, one of which is saved.
        last_img_flag: When true the last ``<li>`` is used; otherwise the
            middle one (index ``len // 2``) is used.

    Prints a message and returns without writing a file when the list,
    the media URL or the download itself is unavailable.
    """
    time.sleep(1)

    markup = bs(elem.get_attribute('innerHTML'), 'html.parser')
    first_list = markup.find('ul')
    list_images = first_list.find_all('li') if first_list is not None else []
    if not list_images:
        print("No list items found for " + img_name)
        return

    if last_img_flag:
        user_image = list_images[-1]
    else:
        user_image = list_images[len(list_images) // 2]

    # a <video> takes precedence over an <img>
    media = user_image.find('video')
    if media is None:
        media = user_image.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No downloadable media found for " + img_name)
        return

    # A timeout keeps a stalled download from hanging the run, and
    # RequestException also covers URLs requests cannot fetch (for
    # example blob: sources).
    try:
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)
 
# function to check if the post is nested
def nested_check():
    """Return the ``coreSpriteRightChevron`` element, or 0 when absent.

    Waits one second before looking. The falsy 0 lets callers use the
    result directly as a loop condition.
    """
    # find_element(By...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3
    from selenium.webdriver.common.by import By

    time.sleep(1)
    try:
        # class name written without the stray trailing spaces
        return chrome.find_element(By.CLASS_NAME, 'coreSpriteRightChevron')
    except selenium.common.exceptions.NoSuchElementException:
        return 0
 
# Driver Code
path()
try:
    time.sleep(1)
    url_name(url)
    login(username, password)
    download_allposts()
finally:
    # quit() ends the whole WebDriver session instead of only closing the
    # current window; the finally block runs it even when a step raises
    chrome.quit()

运行此完整脚本后,将创建一个包含所有帖子的目录。

输出:

注意:帖子保存时不带文件扩展名(如果您是 Windows 用户,文件类型会显示为“文件”),请使用可以打开图像和视频的应用程序打开帖子(Instagram 帖子只有媒体、图像或视频类型)