📜  Flutter的网页抓取

📅  最后修改于: 2021-09-23 06:27:30             🧑  作者: Mango

通过访问网页的 HTML 从网页中提取所需数据/信息的过程称为 Web Scraping(网页抓取),也称为 Web Harvesting 或 Web Data Extraction。

本文讨论了使用 Flutter 的 html 和 http 包进行 Web Scraping 所涉及的步骤。

第 1 步:设置一个新的Flutter应用

通过运行以下命令创建一个新的flutter应用程序:

flutter create YOUR_APP_NAME

  • 在 VS Code 或 Android Studio 中打开应用程序。我使用的是 VS Code。
  • 打开 lib/main.dart 文件并清除所有默认代码。
  • 添加所需小部件的代码。我将有一个AppBar ,一个包含三个Text小部件的Column ,一个CircularProgressIndicator和一个MaterialButton小部件。
Dart
import 'package:flutter/material.dart';
  
/// Application entry point: launches a [MaterialApp] with a green theme whose
/// home screen is [MyApp].
void main() {
  final greenTheme = ThemeData(
    accentColor: Colors.green,
    scaffoldBackgroundColor: Colors.green[100],
    primaryColor: Colors.green,
  );

  runApp(MaterialApp(theme: greenTheme, home: MyApp()));
}
  
/// Root screen of the demo; its mutable state lives in [_MyAppState].
class MyApp extends StatefulWidget {
  /// Creates the screen, forwarding the optional [key] to [StatefulWidget].
  const MyApp({Key key}) : super(key: key);

  @override
  _MyAppState createState() {
    return _MyAppState();
  }
}
  
/// State for [MyApp]: holds the three result strings and the loading flag,
/// and builds the screen layout.
///
/// Extends `State<MyApp>` (not the raw `State`) so that [widget] is typed as
/// [MyApp].
class _MyAppState extends State<MyApp> {
  // Text style shared by the three result labels.
  static const TextStyle _resultStyle =
      TextStyle(fontSize: 20, fontWeight: FontWeight.bold);

  // Strings to store the extracted article titles.
  String result1 = 'Result 1';
  String result2 = 'Result 2';
  String result3 = 'Result 3';

  // Whether the CircularProgressIndicator is shown in place of the results.
  bool isLoading = false;

  @override
  Widget build(BuildContext context) {
    // All spacers are sized relative to the screen height.
    final screenHeight = MediaQuery.of(context).size.height;

    return Scaffold(
      appBar: AppBar(title: Text('GeeksForGeeks')),
      body: Padding(
        padding: const EdgeInsets.all(16.0),
        child: Center(
          child: Column(
            mainAxisAlignment: MainAxisAlignment.center,
            children: [
              // Show the loader while isLoading is true, otherwise the
              // column of result texts.
              isLoading
                  ? CircularProgressIndicator()
                  : Column(
                      children: [
                        Text(result1, style: _resultStyle),
                        SizedBox(height: screenHeight * 0.05),
                        Text(result2, style: _resultStyle),
                        SizedBox(height: screenHeight * 0.05),
                        Text(result3, style: _resultStyle),
                      ],
                    ),
              SizedBox(height: screenHeight * 0.08),
              MaterialButton(
                // No-op in this version of the screen.
                onPressed: () {},
                color: Colors.green,
                child: Text(
                  'Scrap Data',
                  style: TextStyle(color: Colors.white),
                ),
              ),
            ],
          ),
        ),
      ),
    );
  }
}


Dart
import 'package:flutter/material.dart';
import 'package:html/parser.dart' as parser;
import 'package:http/http.dart' as http;
  
/// Application entry point: launches a [MaterialApp] with a green theme whose
/// home screen is [MyApp].
void main() {
  final greenTheme = ThemeData(
    accentColor: Colors.green,
    scaffoldBackgroundColor: Colors.green[100],
    primaryColor: Colors.green,
  );

  runApp(MaterialApp(theme: greenTheme, home: MyApp()));
}
  
/// Root screen of the demo; its mutable state lives in [_MyAppState].
class MyApp extends StatefulWidget {
  /// Creates the screen, forwarding the optional [key] to [StatefulWidget].
  const MyApp({Key key}) : super(key: key);

  @override
  _MyAppState createState() {
    return _MyAppState();
  }
}
  
/// State for [MyApp]: scrapes the first three article titles from the
/// GeeksforGeeks home page on demand and displays them.
///
/// Extends `State<MyApp>` (not the raw `State`) so that [widget] is typed as
/// [MyApp].
class _MyAppState extends State<MyApp> {
  // Text style shared by the three result labels.
  static const TextStyle _resultStyle =
      TextStyle(fontSize: 20, fontWeight: FontWeight.bold);

  // Strings to store the extracted article titles.
  String result1 = 'Result 1';
  String result2 = 'Result 2';
  String result3 = 'Result 3';

  // Whether the CircularProgressIndicator is shown in place of the results
  // while the scrape is awaited.
  bool isLoading = false;

  /// Downloads `https://www.geeksforgeeks.org/` and returns the titles of the
  /// first three entries under the element with class `articles-list`.
  ///
  /// Always completes with exactly three strings:
  ///  * the three trimmed titles on success;
  ///  * `['', '', 'ERROR: <status>.']` when the status code is not 200;
  ///  * `['', '', 'ERROR!']` when the request itself fails or the page does
  ///    not have the expected structure.
  Future<List<String>> extractData() async {
    try {
      // The top-level http.get manages its own client, so no http.Client is
      // left unclosed after the request.
      final response =
          await http.get(Uri.parse('https://www.geeksforgeeks.org/'));

      // Status code 200 means the response has been received successfully.
      if (response.statusCode != 200) {
        return ['', '', 'ERROR: ${response.statusCode}.'];
      }

      // Each title sits at articles-list > child[i] > child[0] > child[0].
      final document = parser.parse(response.body);
      final articles =
          document.getElementsByClassName('articles-list')[0].children;

      final titles = <String>[];
      for (var i = 0; i < 3; i++) {
        final title = articles[i].children[0].children[0].text.trim();
        print(title);
        titles.add(title);
      }
      return titles;
    } catch (e) {
      // Deliberately broad: covers network failures as well as the range
      // errors raised when the page layout differs from what is expected.
      print('extractData failed: $e');
      return ['', '', 'ERROR!'];
    }
  }

  /// Runs [extractData] with the loader visible, then shows its three strings.
  Future<void> _scrapeAndShow() async {
    // Setting isLoading true to show the loader.
    setState(() {
      isLoading = true;
    });

    final titles = await extractData();

    // The screen may have been disposed while the request was in flight;
    // setState must not be called on an unmounted State.
    if (!mounted) return;

    // Display the received strings and hide the loader.
    setState(() {
      result1 = titles[0];
      result2 = titles[1];
      result3 = titles[2];
      isLoading = false;
    });
  }

  @override
  Widget build(BuildContext context) {
    // All spacers are sized relative to the screen height.
    final screenHeight = MediaQuery.of(context).size.height;

    return Scaffold(
      appBar: AppBar(title: Text('GeeksForGeeks')),
      body: Padding(
        padding: const EdgeInsets.all(16.0),
        child: Center(
          child: Column(
            mainAxisAlignment: MainAxisAlignment.center,
            children: [
              // Show the loader while isLoading is true, otherwise the
              // column of result texts.
              isLoading
                  ? CircularProgressIndicator()
                  : Column(
                      children: [
                        Text(result1, style: _resultStyle),
                        SizedBox(height: screenHeight * 0.05),
                        Text(result2, style: _resultStyle),
                        SizedBox(height: screenHeight * 0.05),
                        Text(result3, style: _resultStyle),
                      ],
                    ),
              SizedBox(height: screenHeight * 0.08),
              MaterialButton(
                onPressed: _scrapeAndShow,
                color: Colors.green,
                child: Text(
                  'Scrap Data',
                  style: TextStyle(color: Colors.white),
                ),
              ),
            ],
          ),
        ),
      ),
    );
  }
}


输出 :

第 2 步:添加 HTML 和 HTTP 包。

  • 打开 pubspec.yaml 文件,在 dependencies 下添加两行:`http: ^0.12.0+4` 和 `html: ^0.14.0+3`(每个依赖各占一行,注意使用正确的缩进),然后保存文件。

  • 然后在您的终端中运行命令:
flutter pub get

  • 打开 main.dart 文件,并在文件顶部添加以下几行来导入这两个包:
import 'package:html/parser.dart' as parser;
import 'package:http/http.dart' as http;

第 3 步:添加 Web Scraping 功能

  • 我将通过其演示 Web Scraping 的网页是https://www.geeksforgeeks.org/ 我们将从文章列表中提取前三篇文章的标题,如下图所示。

  • 现在要提取特定数据,我们首先需要确定一个父元素(Parent class):它的类名在整个文档中是唯一的;然后弄清它的子元素层次结构。为此需要查看页面的 HTML 文档:在 Chrome 浏览器中打开网站,右键单击所需文本,然后单击"检查"(Inspect)即可。

  • 从上图可以看出,我选择了一个Parent 类,类名= “articles-list” 因为它与文档中所有其他类的名称不同。现在,如果我们查看我们要提取的Children 类,我们可以看到对于第一篇文章的标题,我们需要这种层次结构
  • 同样,对于第二个和第三个标题,它将是:
  • 现在我们有了类名层次结构,我们可以继续编写执行 Web Scraping 的函数:
/// Downloads `https://www.geeksforgeeks.org/` and returns the titles of the
/// first three entries under the element with class `articles-list`.
///
/// Always completes with exactly three strings:
///  * the three trimmed titles on success;
///  * `['', '', 'ERROR: <status>.']` when the status code is not 200;
///  * `['', '', 'ERROR!']` when the request itself fails or the page does not
///    have the expected structure.
Future<List<String>> extractData() async {
  try {
    // The top-level http.get manages its own client, so no http.Client is
    // left unclosed after the request.
    final response =
        await http.get(Uri.parse('https://www.geeksforgeeks.org/'));

    // Status code 200 means the response has been received successfully.
    if (response.statusCode != 200) {
      return ['', '', 'ERROR: ${response.statusCode}.'];
    }

    // Each title sits at articles-list > child[i] > child[0] > child[0].
    final document = parser.parse(response.body);
    final articles =
        document.getElementsByClassName('articles-list')[0].children;

    final titles = <String>[];
    for (var i = 0; i < 3; i++) {
      final title = articles[i].children[0].children[0].text.trim();
      print(title);
      titles.add(title);
    }
    return titles;
  } catch (e) {
    // Deliberately broad: covers network failures as well as the range
    // errors raised when the page layout differs from what is expected.
    print('extractData failed: $e');
    return ['', '', 'ERROR!'];
  }
}
  • 现在我们在 MaterialButton 的 onPressed: 参数中调用这个函数,并在得到结果之前显示 CircularProgressIndicator。
onPressed: () async {
  // Show the loader while the scrape is in flight.
  setState(() {
    isLoading = true;
  });

  // Await the three strings from the scraping function. If it throws
  // (for example when there is no network), fall back to an error row so
  // the loader cannot be left spinning forever.
  List<String> titles;
  try {
    titles = await extractData();
  } catch (e) {
    titles = ['', '', 'ERROR!'];
  }

  // The screen may have been disposed while awaiting; setState must not be
  // called on an unmounted State.
  if (!mounted) return;

  // Display the received strings and hide the loader.
  setState(() {
    result1 = titles[0];
    result2 = titles[1];
    result3 = titles[2];
    isLoading = false;
  });
}
  • 完成以上所有步骤后,我们的 main.dart 如下所示:

Dart

import 'package:flutter/material.dart';
import 'package:html/parser.dart' as parser;
import 'package:http/http.dart' as http;
  
/// Application entry point: launches a [MaterialApp] with a green theme whose
/// home screen is [MyApp].
void main() {
  final greenTheme = ThemeData(
    accentColor: Colors.green,
    scaffoldBackgroundColor: Colors.green[100],
    primaryColor: Colors.green,
  );

  runApp(MaterialApp(theme: greenTheme, home: MyApp()));
}
  
/// Root screen of the demo; its mutable state lives in [_MyAppState].
class MyApp extends StatefulWidget {
  /// Creates the screen, forwarding the optional [key] to [StatefulWidget].
  const MyApp({Key key}) : super(key: key);

  @override
  _MyAppState createState() {
    return _MyAppState();
  }
}
  
/// State for [MyApp]: scrapes the first three article titles from the
/// GeeksforGeeks home page on demand and displays them.
///
/// Extends `State<MyApp>` (not the raw `State`) so that [widget] is typed as
/// [MyApp].
class _MyAppState extends State<MyApp> {
  // Text style shared by the three result labels.
  static const TextStyle _resultStyle =
      TextStyle(fontSize: 20, fontWeight: FontWeight.bold);

  // Strings to store the extracted article titles.
  String result1 = 'Result 1';
  String result2 = 'Result 2';
  String result3 = 'Result 3';

  // Whether the CircularProgressIndicator is shown in place of the results
  // while the scrape is awaited.
  bool isLoading = false;

  /// Downloads `https://www.geeksforgeeks.org/` and returns the titles of the
  /// first three entries under the element with class `articles-list`.
  ///
  /// Always completes with exactly three strings:
  ///  * the three trimmed titles on success;
  ///  * `['', '', 'ERROR: <status>.']` when the status code is not 200;
  ///  * `['', '', 'ERROR!']` when the request itself fails or the page does
  ///    not have the expected structure.
  Future<List<String>> extractData() async {
    try {
      // The top-level http.get manages its own client, so no http.Client is
      // left unclosed after the request.
      final response =
          await http.get(Uri.parse('https://www.geeksforgeeks.org/'));

      // Status code 200 means the response has been received successfully.
      if (response.statusCode != 200) {
        return ['', '', 'ERROR: ${response.statusCode}.'];
      }

      // Each title sits at articles-list > child[i] > child[0] > child[0].
      final document = parser.parse(response.body);
      final articles =
          document.getElementsByClassName('articles-list')[0].children;

      final titles = <String>[];
      for (var i = 0; i < 3; i++) {
        final title = articles[i].children[0].children[0].text.trim();
        print(title);
        titles.add(title);
      }
      return titles;
    } catch (e) {
      // Deliberately broad: covers network failures as well as the range
      // errors raised when the page layout differs from what is expected.
      print('extractData failed: $e');
      return ['', '', 'ERROR!'];
    }
  }

  /// Runs [extractData] with the loader visible, then shows its three strings.
  Future<void> _scrapeAndShow() async {
    // Setting isLoading true to show the loader.
    setState(() {
      isLoading = true;
    });

    final titles = await extractData();

    // The screen may have been disposed while the request was in flight;
    // setState must not be called on an unmounted State.
    if (!mounted) return;

    // Display the received strings and hide the loader.
    setState(() {
      result1 = titles[0];
      result2 = titles[1];
      result3 = titles[2];
      isLoading = false;
    });
  }

  @override
  Widget build(BuildContext context) {
    // All spacers are sized relative to the screen height.
    final screenHeight = MediaQuery.of(context).size.height;

    return Scaffold(
      appBar: AppBar(title: Text('GeeksForGeeks')),
      body: Padding(
        padding: const EdgeInsets.all(16.0),
        child: Center(
          child: Column(
            mainAxisAlignment: MainAxisAlignment.center,
            children: [
              // Show the loader while isLoading is true, otherwise the
              // column of result texts.
              isLoading
                  ? CircularProgressIndicator()
                  : Column(
                      children: [
                        Text(result1, style: _resultStyle),
                        SizedBox(height: screenHeight * 0.05),
                        Text(result2, style: _resultStyle),
                        SizedBox(height: screenHeight * 0.05),
                        Text(result3, style: _resultStyle),
                      ],
                    ),
              SizedBox(height: screenHeight * 0.08),
              MaterialButton(
                onPressed: _scrapeAndShow,
                color: Colors.green,
                child: Text(
                  'Scrap Data',
                  style: TextStyle(color: Colors.white),
                ),
              ),
            ],
          ),
        ),
      ),
    );
  }
}

输出:

想要一个更快节奏和更具竞争力的环境来学习 Android 的基础知识吗?
单击此处前往由我们的专家精心策划的指南,旨在让您立即做好行业准备!