Scrape the most reviewed news and tweet it using Python
Many websites publish trending news on any given technology, and their articles can be ranked by the number of reader comments (reviews). Suppose the news is about cryptocurrency and the articles are scraped from cointelegraph.com; we can then record the review count of each news item and store everything in a MongoDB collection.
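The scraping step itself is outside the scope of this article, but the tweeting code below assumes a particular document shape in that collection. A minimal, hypothetical sketch of one stored document is shown here; the field names are taken from the code that follows, while the headlines, counts, and links are purely illustrative.
Python3
# Illustrative only: assumed shape of one document in the
# Coll_DailyNewsPlusReview collection that the tweeting code reads.
# Each "News" entry is [headline, review count, article link].
scraped_doc = {
    "time": "2020-05-12",   # str(date.today()) at scrape time
    "News": [
        ["Veteran Investor Says ...", "57", "https://cointelegraph.com/news/..."],
        ["Bitcoin Hashrate Drops ...", "34", "https://cointelegraph.com/news/..."],
        ["The VC Who ...", "21", "https://cointelegraph.com/news/..."],
    ],
}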
Modules required
- Tweepy: Tweepy is a Python client for the official Twitter API. Install it with the following pip command:
pip install tweepy
- MongoClient: The MongoClient class (from pymongo) lets your code establish a connection to a MongoDB server. Install it with the following pip command:
pip install pymongo
- Pyshorteners: Pyshorteners is used to shorten, brand, share, or retrieve data from links programmatically. Install it with the following pip command; a quick sanity check of all three modules follows this list:
pip install pyshorteners
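To confirm that the three modules are installed and usable, an optional check might look like the sketch below. The TinyURL backend is used here only because it needs no API key, and a MongoDB server is assumed to be running on the default local port.
Python3
import tweepy
import pymongo
import pyshorteners

print("tweepy", tweepy.__version__)
print("pymongo", pymongo.version)

# Shorten a test link; the TinyURL backend needs no credentials
s = pyshorteners.Shortener()
print(s.tinyurl.short("https://cointelegraph.com/"))

# Ping the local MongoDB server
client = pymongo.MongoClient("mongodb://localhost:27017/")
print("MongoDB", client.server_info()["version"])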
Authentication
To access the Twitter API and post tweets, you need to register an application through your Twitter account. Follow these steps:
- Open this link https://apps.twitter.com/ and click the button: "Create New App".
- Fill in the application details. You can leave the callback URL field empty.
- Once the application is created, you will be redirected to the application page.
- Open the "Keys and Access Tokens" tab.
- Copy "Consumer Key", "Consumer Secret", "Access token" and "Access Token Secret" and paste them into the code below (a short credential check is sketched right after these steps).
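Before wiring the keys into the full script, it can be worth verifying them with a few lines of Tweepy; the "xxxx" placeholders stand for your own keys.
Python3
import tweepy

# Replace the placeholders with the keys copied from the app page
consumer_key = "xxxx"
consumer_secret = "xxxx"
access_token = "xxxx"
access_token_secret = "xxxx"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Raises an error if the credentials are invalid
print(api.verify_credentials().screen_name)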
Below is the implementation.
Python3
# Python program to get the top 3 trending news items and tweet them
import tweepy
import json
from datetime import date, timedelta, datetime
from pymongo import MongoClient
from html.parser import HTMLParser
import re
from pyshorteners import Shortener
NewsArrayIndex = 0
NewsArray = [None] * 3
class MyHTMLParser(HTMLParser):
    # This handler collects the value of every href
    # attribute and stores it in NewsArray, advancing
    # NewsArrayIndex as it goes
    def handle_starttag(self, tag, attrs):
        # Only parse the 'anchor' tag.
        global NewsArrayIndex
        if tag == "a":
            # Check the list of defined attributes.
            for name, value in attrs:
                # If href is defined, store it.
                if name == "href":
                    NewsArray[NewsArrayIndex] = value
                    NewsArrayIndex += 1
# This function is the primary place to tweet the
# collected daily news. The news is retrieved from the
# Coll_DailyNewsPlusReview collection (a MongoDB
# collection), which holds the news headline, its review
# count and the news link. Based upon the review count,
# the top-most reviewed news items are taken. As Twitter
# allows only 280 characters, each retrieved news link is
# shortened using the Bitly API, and hashtags related to
# the news are added underneath the retrieved top 3 news
# items (all together the allowed characters are 280).
# The top 3 news items are then tweeted from a credential.
# Finally, on a per-day basis, the tweeted news is stored
# into another collection, both for audit purposes and
# for weekly posting.
def tweetDailyNews():
    try:
        # Coll_DailyNewsPlusReview is the collection name in MongoDB
        cursor_P = db1.Coll_DailyNewsPlusReview.find({"time": date_str})
        p0 = cursor_P[0]
        News = p0.get('News')

        # Sort the news items by review count, highest first
        sortedNews = sorted(News, key=lambda x: int(x[1]), reverse=True)
        print(sortedNews[0][0] + "--" + sortedNews[0][1],
              sortedNews[1][0] + ".." + sortedNews[1][1],
              sortedNews[2][0] + ".." + sortedNews[2][1])

        # Feed each link through the HTML parser so that the
        # href values get collected into NewsArray
        hyperlink_format = '<a href="{link}">{text}</a>'
        parser = MyHTMLParser()
        dailyNews = "Impactful News of the Day" + "\n"

        News0 = sortedNews[0][2]
        parser.feed(hyperlink_format.format(link=News0, text=News0))
        News1 = sortedNews[1][2]
        print("News1", News1)
        parser.feed(hyperlink_format.format(link=News1, text=News1))
        News2 = sortedNews[2][2]
        print(News2)
        parser.feed(hyperlink_format.format(link=News2, text=News2))

        # Shorten each collected link with the Bitly API
        BITLY_ACCESS_TOKEN = "xxxx"   # your Bitly generic access token
        b = Shortener(api_key=BITLY_ACCESS_TOKEN)

        NewsArray[0] = re.sub('\n', '', NewsArray[0])
        response1 = b.bitly.short(NewsArray[0])   # returns the shortened URL
        NewsArray[1] = re.sub('\n', '', NewsArray[1])
        response2 = b.bitly.short(NewsArray[1])
        NewsArray[2] = re.sub('\n', '', NewsArray[2])
        response3 = b.bitly.short(NewsArray[2])

        # Compose the tweet: the first three words of each headline,
        # its shortened link, and the hashtags
        news1FewWords = sortedNews[0][0].split()
        dailyNews += (news1FewWords[0] + " "
                      + news1FewWords[1] + " " + news1FewWords[2]
                      + "...." + response1 + "\n")
        news2FewWords = sortedNews[1][0].split()
        dailyNews += (news2FewWords[0] + " "
                      + news2FewWords[1] + " " + news2FewWords[2]
                      + "...." + response2 + "\n")
        news3FewWords = sortedNews[2][0].split()
        dailyNews += (news3FewWords[0] + " "
                      + news3FewWords[1] + " " + news3FewWords[2]
                      + "...." + response3 + "\n"
                      + "#bitcoin #cryptocurrency #blockchain "
                      + "#investor #altcoins #fintech #investment")
        print(dailyNews)

        # Post the tweet and archive what was tweeted
        status = api.update_status(status=dailyNews)
        if status:
            for i in range(3):
                datas = {}
                datas['time'] = str(date.today())
                datas['posted_as'] = i
                datas['news'] = sortedNews[i][0]
                datas['shortenedlink'] = NewsArray[i]
                datas['reviewcount'] = sortedNews[i][1]
                datas['link'] = sortedNews[i][2]
                db1.Collection_tweeted_news.insert_one(datas)
    except Exception as e:
        print(e)
        print("Error in getting today news data", str(date_str))
# Driver Code
date_str = str(date.today())
print("today", date_str)

client = MongoClient('mongodb://localhost:27017/')
# Connect your database here
db1 = client.xxxx

# Credentials to tweet
consumer_key = "xxxx"
consumer_secret = "xxxx"
access_token = "xxxx"
access_token_secret = "xxxx"

# Authentication of consumer key and secret
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)

# Authentication of access token and secret
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,
                 wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)

tweetDailyNews()
Output:
Impactful News of the Day
Veteran Investor Says….https://bit.ly/2X1x51V
Bitcoin Hashrate Drops….https://bit.ly/2T83xyS
The VC Who….https://bit.ly/3czxVKb
#bitcoin #cryptocurrency #blockchain #investor #altcoins #fintech #investment
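Because every tweeted item is archived in the Collection_tweeted_news collection, it can be read back later, for example when assembling the weekly post mentioned above. Below is a minimal sketch of querying the last seven days, assuming the same local MongoDB setup as in the main script.
Python3
from datetime import date, timedelta
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db1 = client.xxxx   # same placeholder database as in the main script

# The main script stores dates as plain strings, so build matching strings
last_week = [str(date.today() - timedelta(days=d)) for d in range(7)]

for doc in db1.Collection_tweeted_news.find({"time": {"$in": last_week}}):
    print(doc["time"], doc["news"], doc["shortenedlink"])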