📜  HTML 实体解析器

📅  最后修改于: 2021-11-10 04:21:35             🧑  作者: Mango

给定一个字符串str ,其中包含各种 HTML 实体,任务是将这些实体替换为其相应的特殊字符。

HTML 实体及其相应的特殊字符如下表所示:

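| Name / Description    | HTML Entity | Special Character |
|-----------------------|-------------|-------------------|
| Space                 | &nbsp;      | (空格)            |
| Ampersand             | &amp;       | &                 |
| Greater than          | &gt;        | >                 |
| Less than             | &lt;        | <                 |
| Single Quotation Mark | &apos;      | '                 |
| Double Quotation Mark | &quot;      | "                 |
| Registered Trademark  | &reg;       | ®                 |
| Copyright mark        | &copy;      | ©                 |
| Forward Slash         | &frasl;     | /                 |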

例子:

输入: str = "17 &gt; 25 and 25 &lt; 17"
输出: 17 > 25 and 25 < 17

方法一——使用unordered_map:步骤如下:

  1. 将 HTML 实体及其对应的字符存储在 Map 中。
  2. 遍历给定的字符串,如果遇到字符“&”,则查找该符号之后出现的是哪个 HTML 实体。
  3. 在输出字符串中添加与该实体对应的字符。
  4. 打印输出字符串作为结果。

下面是上述方法的实现:

C++
// C++ program for the above approach
#include <iostream>
#include <string>
#include <unordered_map>
using namespace std;
  
// Decodes a fixed set of HTML entities (for example "&gt;") into the
// characters they stand for, using a hash map from entity text to
// replacement text.
class GfG {
public:
    // Entity text (leading '&' and trailing ';' included) mapped to its
    // replacement. Stays empty until initializeMap() is called.
    std::unordered_map<std::string, std::string> m;

public:
    // Populates m with the supported entities. Call this before
    // parseInputString(); with an empty map nothing is decoded.
    void initializeMap()
    {
        m["&quot;"] = "\"";
        m["&apos;"] = "'";
        m["&amp;"] = "&";
        m["&gt;"] = ">";
        m["&lt;"] = "<";
        m["&frasl;"] = "/";
        m["&nbsp;"] = " ";
        m["&reg;"] = "®";
        m["&copy;"] = "©";
    }

public:
    // Returns a copy of `input` in which every entity found in m is
    // replaced by its mapped text. Text that is not a known entity,
    // including a lone '&' or an unknown "&name;", is copied unchanged.
    // Replacements are never re-scanned, so "&amp;gt;" yields "&gt;".
    std::string parseInputString(std::string input)
    {
        // No entity can be longer than the longest key, so the search
        // for the closing ';' is bounded by this length.
        std::size_t longest = 0;
        for (const auto& entry : m) {
            if (entry.first.size() > longest) {
                longest = entry.first.size();
            }
        }

        std::string output;
        output.reserve(input.size());

        std::size_t i = 0;
        while (i < input.size()) {
            if (input[i] == '&') {
                // One past the last index that may still hold the ';'
                const std::size_t stop
                    = (input.size() - i < longest) ? input.size()
                                                   : i + longest;

                // Locate the first ';' inside the window, if any
                std::size_t semi = i + 1;
                while (semi < stop && input[semi] != ';') {
                    ++semi;
                }

                if (semi < stop) {
                    const auto found
                        = m.find(input.substr(i, semi - i + 1));
                    if (found != m.end()) {
                        output += found->second;
                        i = semi + 1;
                        continue;
                    }
                }
            }

            // Not the start of a known entity: copy the character as is.
            // Only this one character is consumed, so an entity that
            // follows a stray '&' is still decoded.
            output += input[i];
            ++i;
        }

        return output;
    }
};
  
// Driver Code
int main()
{
    // Given String
    string input = "17 > 25 and 25 < 17";
    GfG g;
  
    // Initialised parsed string
    g.initializeMap();
  
    // Function Call
    cout << g.parseInputString(input);
    return 0;
}


C++
// C++ program to Parse the HTML Entities
#include <iostream>
#include <string>
using namespace std;
  
// Decodes a fixed set of HTML entities (for example "&gt;") into the
// characters they stand for by matching the text that follows each '&'
// against a small built-in table.
class GfG {

public:
    // Returns a copy of `input` in which every supported entity is
    // replaced by its character. Text that is not a supported entity,
    // including a lone '&' or an unknown "&name;", is copied unchanged.
    // Replacements are never re-scanned, so "&amp;gt;" yields "&gt;".
    std::string parseInputString(std::string input)
    {
        // Supported entities and the text each one decodes to
        struct Entity {
            std::string name;
            std::string text;
        };
        static const Entity kEntities[] = {
            { "&quot;", "\"" },
            { "&apos;", "'" },
            { "&amp;", "&" },
            { "&gt;", ">" },
            { "&lt;", "<" },
            { "&frasl;", "/" },
            { "&nbsp;", " " },
            { "&reg;", "®" },
            { "&copy;", "©" },
        };

        // To store parsed string
        std::string output;
        output.reserve(input.size());

        std::size_t i = 0;
        while (i < input.size()) {
            bool matched = false;

            if (input[i] == '&') {
                for (const Entity& entity : kEntities) {
                    // compare() clips at the end of input, so an entity
                    // cut off by the end of the string never matches
                    if (input.compare(i, entity.name.size(), entity.name)
                        == 0) {
                        output += entity.text;
                        i += entity.name.size();
                        matched = true;
                        break;
                    }
                }
            }

            // Not the start of a known entity: copy the character as is.
            // Only this one character is consumed, so an entity that
            // follows a stray '&' is still decoded.
            if (!matched) {
                output += input[i];
                ++i;
            }
        }

        // Return the parsed string
        return output;
    }
};
  
// Driver Code
int main()
{
    // Given String
    string input = "17 > 25 and 25 < 17";
    GfG g;
  
    // Initialised parsed string
    g.initializeMap();
  
    // Function Call
    cout << g.parseInputString(input);
    return 0;
}


C++
// C++ program for the above approach
#include <iostream>
#include <regex>
#include <string>
#include <unordered_map>
using namespace std;
  
// Supported HTML entities mapped to the text each one decodes to.
// No key contains a regex metacharacter, so keys can be used verbatim
// as regex alternatives.
const std::unordered_map<std::string, std::string> m = {
    { "&quot;", "\"" },
    { "&apos;", "'" },
    { "&amp;", "&" },
    { "&gt;", ">" },
    { "&lt;", "<" },
    { "&frasl;", "/" }
};

// Returns a copy of `input` in which every entity listed in m is
// replaced by its mapped text; all other text is copied unchanged.
//
// All entities are matched by one alternation in a single left-to-right
// pass. Running one replace pass per key would make the result depend
// on the unordered_map iteration order: "&amp;lt;" could first become
// "&lt;" and then "<". Here it yields "&lt;".
std::string
parseInputString(std::string input)
{
    // Built once on first use: "key1|key2|..."
    static const std::regex entityPattern = [] {
        std::string alternation;
        for (const auto& entry : m) {
            if (!alternation.empty()) {
                alternation += '|';
            }
            alternation += entry.first;
        }
        return std::regex(alternation);
    }();

    std::string output;
    output.reserve(input.size());

    // Start of the input text not yet copied to output
    auto copiedUpTo = input.cbegin();

    for (std::sregex_iterator it(input.cbegin(), input.cend(),
                                 entityPattern),
         last;
         it != last; ++it) {
        // Literal text between the previous match and this one
        output.append(copiedUpTo, (*it)[0].first);

        // Every match is one of the keys of m by construction
        output += m.at(it->str());
        copiedUpTo = (*it)[0].second;
    }

    // Literal text after the final match
    output.append(copiedUpTo, input.cend());

    // Return the parsed string
    return output;
}
  
// Driver Code
int main()
{
    // Given String
    string input
        = "17 > 25 and 25 < 17";
  
    // Function Call
    cout << parseInputString(input);
    return 0;
}


输出:
17 > 25 and 25 < 17

时间复杂度: O(N)
辅助空间: O(N)

方法 2 – 使用模式匹配:
以下是步骤:

  1. 遍历给定的字符串str
  2. 遍历时,如果遇到任何字符“&” ,则查找此&符号后存在哪个 HTML 实体。
  3. 将其与上表中的实体逐一匹配,并在输出字符串中添加与该实体对应的字符。
  4. 遍历上述字符串后,打印输出字符串作为结果。

下面是上述方法的实现:

C++

// C++ program to Parse the HTML Entities
#include <iostream>
#include <string>
using namespace std;
  
// Decodes a fixed set of HTML entities (for example "&gt;") into the
// characters they stand for by matching the text that follows each '&'
// against a small built-in table.
class GfG {

public:
    // Returns a copy of `input` in which every supported entity is
    // replaced by its character. Text that is not a supported entity,
    // including a lone '&' or an unknown "&name;", is copied unchanged.
    // Replacements are never re-scanned, so "&amp;gt;" yields "&gt;".
    std::string parseInputString(std::string input)
    {
        // Supported entities and the text each one decodes to
        struct Entity {
            std::string name;
            std::string text;
        };
        static const Entity kEntities[] = {
            { "&quot;", "\"" },
            { "&apos;", "'" },
            { "&amp;", "&" },
            { "&gt;", ">" },
            { "&lt;", "<" },
            { "&frasl;", "/" },
            { "&nbsp;", " " },
            { "&reg;", "®" },
            { "&copy;", "©" },
        };

        // To store parsed string
        std::string output;
        output.reserve(input.size());

        std::size_t i = 0;
        while (i < input.size()) {
            bool matched = false;

            if (input[i] == '&') {
                for (const Entity& entity : kEntities) {
                    // compare() clips at the end of input, so an entity
                    // cut off by the end of the string never matches
                    if (input.compare(i, entity.name.size(), entity.name)
                        == 0) {
                        output += entity.text;
                        i += entity.name.size();
                        matched = true;
                        break;
                    }
                }
            }

            // Not the start of a known entity: copy the character as is.
            // Only this one character is consumed, so an entity that
            // follows a stray '&' is still decoded.
            if (!matched) {
                output += input[i];
                ++i;
            }
        }

        // Return the parsed string
        return output;
    }
};
  
// Driver Code
int main()
{
    // Given String
    string input = "17 > 25 and 25 < 17";
    GfG g;
  
    // Initialised parsed string
    g.initializeMap();
  
    // Function Call
    cout << g.parseInputString(input);
    return 0;
}
输出:

17 > 25 and 25 < 17

时间复杂度: O(N)
辅助空间: O(N)

方法 3 – 使用正则表达式
以下是步骤:

  1. 将所有表达式及其映射值存储在 Map M 中
  2. 对于 Map 中的每个键值对,用键(实体字符串)创建正则表达式:regex e(key);
  3. 使用 regex_replace(input, e, value) 将输入中所有匹配该正则表达式的子串替换为它在 Map M 中的映射值。
  4. 对 Map 中的每个表达式重复以上步骤,直到所有表达式都已被替换。

下面是上述方法的实现:

C++

// C++ program for the above approach
#include <iostream>
#include <regex>
#include <string>
#include <unordered_map>
using namespace std;
  
// Supported HTML entities mapped to the text each one decodes to.
// No key contains a regex metacharacter, so keys can be used verbatim
// as regex alternatives.
const std::unordered_map<std::string, std::string> m = {
    { "&quot;", "\"" },
    { "&apos;", "'" },
    { "&amp;", "&" },
    { "&gt;", ">" },
    { "&lt;", "<" },
    { "&frasl;", "/" }
};

// Returns a copy of `input` in which every entity listed in m is
// replaced by its mapped text; all other text is copied unchanged.
//
// All entities are matched by one alternation in a single left-to-right
// pass. Running one replace pass per key would make the result depend
// on the unordered_map iteration order: "&amp;lt;" could first become
// "&lt;" and then "<". Here it yields "&lt;".
std::string
parseInputString(std::string input)
{
    // Built once on first use: "key1|key2|..."
    static const std::regex entityPattern = [] {
        std::string alternation;
        for (const auto& entry : m) {
            if (!alternation.empty()) {
                alternation += '|';
            }
            alternation += entry.first;
        }
        return std::regex(alternation);
    }();

    std::string output;
    output.reserve(input.size());

    // Start of the input text not yet copied to output
    auto copiedUpTo = input.cbegin();

    for (std::sregex_iterator it(input.cbegin(), input.cend(),
                                 entityPattern),
         last;
         it != last; ++it) {
        // Literal text between the previous match and this one
        output.append(copiedUpTo, (*it)[0].first);

        // Every match is one of the keys of m by construction
        output += m.at(it->str());
        copiedUpTo = (*it)[0].second;
    }

    // Literal text after the final match
    output.append(copiedUpTo, input.cend());

    // Return the parsed string
    return output;
}
  
// Driver Code
int main()
{
    // Given String
    string input
        = "17 > 25 and 25 < 17";
  
    // Function Call
    cout << parseInputString(input);
    return 0;
}
输出:
17 > 25 and 25 < 17

时间复杂度: O(N)
辅助空间: O(N)