Email ID Extractor Project from Sites in Scrapy Python
Scrapy is an open-source web crawling framework written in Python; it is used for web scraping and can also be used to extract data for general purposes. In this project, all sub-page links are first collected from the home page, and email IDs are then scraped from those sub-pages using a regular expression.
This article shows email IDs scraped from the geeksforgeeks site as a reference.
How to create an Email ID Extractor project using Scrapy?
1. Installation of packages – run the following commands from the terminal:
pip install scrapy
pip install scrapy-selenium
2. Create the project –
scrapy startproject projectname (Here projectname is geeksemailtrack)
cd projectname
scrapy genspider spidername domain (Here spidername is emails, and domain is the website to crawl, e.g. geeksforgeeks.org)
3. Add the following code to the settings.py file to use scrapy-selenium:
from shutil import which
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
SELENIUM_DRIVER_ARGUMENTS = []
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}
4. Now download the Chrome driver that matches your version of Chrome and place it in the project directory, next to the scrapy.cfg file. To download the Chrome driver, refer to this site – Download chrome driver.
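If the chromedriver binary sits next to scrapy.cfg rather than on the system PATH, which('chromedriver') may return None. Below is a minimal sketch of an alternative settings.py configuration, assuming the driver file is in the project root (the headless flag is optional):
# settings.py – explicit driver path instead of shutil.which()
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = './chromedriver'   # assumed location next to scrapy.cfg
SELENIUM_DRIVER_ARGUMENTS = ['--headless']           # optional: run Chrome without a window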
Directory structure –
Step by Step Code –
1. Import all the required libraries –
Python3
# web scraping framework
import scrapy
# for regular expression
import re
# for selenium request
from scrapy_selenium import SeleniumRequest
# for link extraction
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
2. Create the start_requests function to visit the site through Selenium. You can add your own URL here.
Python3
def start_requests(self):
    yield SeleniumRequest(
        url="https://www.geeksforgeeks.org/",
        wait_time=3,
        screenshot=True,
        callback=self.parse,
        dont_filter=True
    )
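Besides wait_time and screenshot, SeleniumRequest also accepts a wait_until condition, so the spider can wait for a specific element instead of a fixed delay. A minimal sketch, assuming a scrapy-selenium version that supports wait_until (the condition shown is only an illustration):
# variation of start_requests: wait until the <body> element is present
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from scrapy_selenium import SeleniumRequest

def start_requests(self):
    yield SeleniumRequest(
        url="https://www.geeksforgeeks.org/",
        wait_time=10,  # upper bound for the explicit wait
        wait_until=EC.presence_of_element_located((By.TAG_NAME, 'body')),
        callback=self.parse,
        dont_filter=True
    )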
3. Create the parse function:
Python3
def parse(self, response):
    # this helps to get all links from the source code
    links = LxmlLinkExtractor(allow=()).extract_links(response)

    # Finallinks contains the link urls
    Finallinks = [str(link.url) for link in links]

    # links list for urls that may have email ids
    links = []

    # filtering and storing only the needed urls in the links list
    # pages such as "about us" and "contact us" are the ones that usually have email ids
    for link in Finallinks:
        if ('Contact' in link or 'contact' in link or 'About' in link or 'about' in link or 'CONTACT' in link or 'ABOUT' in link):
            links.append(link)

    # the current page url is also added because a few sites have email ids on their main page
    links.append(str(response.url))

    # the parse_link function is called for extracting email ids
    l = links[0]
    links.pop(0)

    # meta helps to transfer the links list from parse to parse_link
    yield SeleniumRequest(
        url=l,
        wait_time=3,
        screenshot=True,
        callback=self.parse_link,
        dont_filter=True,
        meta={'links': links}
    )
Explanation of the parse function –
- In the following lines, all the links are extracted from the https://www.geeksforgeeks.org/ response:
links = LxmlLinkExtractor(allow=()).extract_links(response)
Finallinks = [str(link.url) for link in links]
- Finallinks is a list containing all the extracted links.
- To avoid unnecessary links we set a filter: details are scraped from a page only if its link points to a contact or about page.
for link in Finallinks:
    if ('Contact' in link or 'contact' in link or 'About' in link or 'about' in link
            or 'CONTACT' in link or 'ABOUT' in link):
        links.append(link)
- The above filter is not compulsory, but sites do have a lot of anchor tags (links); if a site has 50 sub-pages, the spider would otherwise extract emails from all 50 sub-URLs. Emails are mostly found on the main page, the contact page and the about page, so this filter helps reduce the time wasted on scraping URLs that are unlikely to contain email IDs (a more compact, case-insensitive version of the check is sketched after this list).
- The links of pages that may contain email IDs are requested one by one, and the email IDs are scraped using a regular expression.
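Since the keyword check above only differs by letter case, the same filter can be written more compactly with a case-insensitive comparison. A small optional sketch, not part of the original code:
# case-insensitive version of the contact/about filter
links = [link for link in Finallinks
         if 'contact' in link.lower() or 'about' in link.lower()]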
4. Create the parse_link function code:
Python3
def parse_link(self, response):
    # response.meta['links'] helps to get the links list
    links = response.meta['links']
    flag = 0

    # links that contain the following bad words are discarded
    bad_words = ['facebook', 'instagram', 'youtube', 'twitter', 'wiki', 'linkedin']

    for word in bad_words:
        # if any bad word is found in the current page url,
        # flag is set to 1
        if word in str(response.url):
            flag = 1
            break

    # if flag is 1 then there is no need to get emails from
    # that url/page
    if (flag != 1):
        html_text = str(response.text)
        # regular expression used for email ids
        email_list = re.findall(r'\w+@\w+\.{1}\w+', html_text)
        # set of email_list to keep only unique ids
        email_list = set(email_list)
        if (len(email_list) != 0):
            for i in email_list:
                # adding email ids to the final uniqueemail set
                self.uniqueemail.add(i)

    # parse_link is called again while links remain,
    # otherwise the spider moves on to the parsed function
    if (len(links) > 0):
        l = links[0]
        links.pop(0)
        yield SeleniumRequest(
            url=l,
            callback=self.parse_link,
            dont_filter=True,
            meta={'links': links}
        )
    else:
        yield SeleniumRequest(
            url=response.url,
            callback=self.parsed,
            dont_filter=True
        )
Explanation of the parse_link function:
Through response.text we get the whole source code of the requested URL. The regular expression r'\w+@\w+\.{1}\w+' used here can be read roughly as: find every piece of string that starts with one or more word characters, followed by an at sign ('@'), followed by one or more word characters and a dot, and after the dot one or more word characters again. This is the regular expression used for extracting email IDs.
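The behaviour of this pattern can be checked in isolation. A quick sketch with made-up sample text, showing that it matches real addresses but also version-like tokens such as select@1.13 (which is why the later filter in parsed is needed):
import re

# the sample text is invented purely for illustration
sample = "Contact careers@geeksforgeeks.org or support@example.com. Build select@1.13 is not an address."
print(re.findall(r'\w+@\w+\.{1}\w+', sample))
# ['careers@geeksforgeeks.org', 'support@example.com', 'select@1.13']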
5. Create the parsed function –
Python3
def parsed(self, response):
    # emails list built from the uniqueemail set
    emails = list(self.uniqueemail)
    finalemail = []

    for email in emails:
        # avoid garbage values by checking for '.in', '.com',
        # 'info' and 'org', and append the email ids to finalemail
        if ('.in' in email or '.com' in email or 'info' in email or 'org' in email):
            finalemail.append(email)

    # final unique email ids from the geeksforgeeks site
    print('\n'*2)
    print("Emails scraped", finalemail)
    print('\n'*2)
Explanation of the parsed function:
The regular expression above also produces garbage values such as select@1.13 among the IDs scraped from geeksforgeeks, and we know select@1.13 is not an email ID. The parsed function therefore applies a filter that only keeps emails containing substrings such as '.com' and '.in'.
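An alternative to post-filtering (a sketch only, not the approach used in this article) is to tighten the regular expression itself so that the part after the last dot must be alphabetic, which drops version-like matches such as select@1.13 up front:
import re

# stricter pattern: the top-level domain must be at least two letters
html_text = "Build select@1.13 shipped. Write to feedback@geeksforgeeks.org."
email_list = set(re.findall(r'[\w.+-]+@[\w-]+\.[A-Za-z]{2,}', html_text))
print(email_list)  # {'feedback@geeksforgeeks.org'}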
Run the spider with the following command –
scrapy crawl spidername (spidername is the name of the spider)
Garbage values among the scraped emails:
Final scraped emails:
Complete code of the spider –
Python3
# web scraping framework
import scrapy

# for regular expressions
import re

# for selenium requests
from scrapy_selenium import SeleniumRequest

# for link extraction
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


class EmailtrackSpider(scrapy.Spider):
    # name of the spider
    name = 'emailtrack'

    # to store unique email ids
    uniqueemail = set()

    # start_requests sends a request to https://www.geeksforgeeks.org/
    # and the parse function is called on the response
    def start_requests(self):
        yield SeleniumRequest(
            url="https://www.geeksforgeeks.org/",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        # this helps to get all links from the source code
        links = LxmlLinkExtractor(allow=()).extract_links(response)

        # Finallinks contains the link urls
        Finallinks = [str(link.url) for link in links]

        # links list for urls that may have email ids
        links = []

        # filtering and storing only the needed urls in the links list
        # pages such as "about us" and "contact us" are the ones that usually have email ids
        for link in Finallinks:
            if ('Contact' in link or 'contact' in link or 'About' in link or 'about' in link or 'CONTACT' in link or 'ABOUT' in link):
                links.append(link)

        # the current page url is also added because a few sites have email ids on their main page
        links.append(str(response.url))

        # the parse_link function is called for extracting email ids
        l = links[0]
        links.pop(0)

        # meta helps to transfer the links list from parse to parse_link
        yield SeleniumRequest(
            url=l,
            wait_time=3,
            screenshot=True,
            callback=self.parse_link,
            dont_filter=True,
            meta={'links': links}
        )

    def parse_link(self, response):
        # response.meta['links'] helps to get the links list
        links = response.meta['links']
        flag = 0

        # links that contain the following bad words are discarded
        bad_words = ['facebook', 'instagram', 'youtube', 'twitter', 'wiki', 'linkedin']

        for word in bad_words:
            # if any bad word is found in the current page url,
            # flag is set to 1
            if word in str(response.url):
                flag = 1
                break

        # if flag is 1 then there is no need to get emails from
        # that url/page
        if (flag != 1):
            html_text = str(response.text)
            # regular expression used for email ids
            email_list = re.findall(r'\w+@\w+\.{1}\w+', html_text)
            # set of email_list to keep only unique ids
            email_list = set(email_list)
            if (len(email_list) != 0):
                for i in email_list:
                    # adding email ids to the final uniqueemail set
                    self.uniqueemail.add(i)

        # parse_link is called again while links remain,
        # otherwise the spider moves on to the parsed function
        if (len(links) > 0):
            l = links[0]
            links.pop(0)
            yield SeleniumRequest(
                url=l,
                callback=self.parse_link,
                dont_filter=True,
                meta={'links': links}
            )
        else:
            yield SeleniumRequest(
                url=response.url,
                callback=self.parsed,
                dont_filter=True
            )

    def parsed(self, response):
        # emails list built from the uniqueemail set
        emails = list(self.uniqueemail)
        finalemail = []

        for email in emails:
            # avoid garbage values by checking for '.in', '.com',
            # 'info' and 'org', and append the email ids to finalemail
            if ('.in' in email or '.com' in email or 'info' in email or 'org' in email):
                finalemail.append(email)

        # final unique email ids from the geeksforgeeks site
        print('\n'*2)
        print("Emails scraped", finalemail)
        print('\n'*2)
Working video of the above code –
Reference – Link Extractor