HTML实体解析器 - 芒果文档

📌 相关文章

📜 HTML实体解析器

📅 最后修改于: 2021-05-04 13:24:28 🧑 作者: Mango

给定其中包含各种HTML实体的字符串str ，任务是将这些实体替换为其相应的特殊字符。

An HTML entity parser takes HTML code as input and replaces every entity of a special character with the character itself. For example, for the quotation mark the entity is &quot; and the corresponding character is ".

为什么编程需要懂一点英语

以下是HTML实体及其相应的特殊字符，如下表所示：

Name/ Description	HTML Entity	Special Character
Space	&nbsp;	(space)
Ampersand	&amp;	&
Greater than	&gt;	>
Less than	&lt;	<
Single Quotation Mark	&apos;	'
Double Quotation Mark	&quot;	"
Trademark	&reg;	®
Copyright mark	&copy;	©
Forward Slash	&frasl;	⁄

例子：

Input: str = “17 &gt; 25 and 25 &lt; 17”
Output: 17 > 25 and 25 < 17
Explanation: In the above example &gt; is
replaced by the corresponding special character
> and &lt; is replaced by <

Input: str = “&copy; is symbol of copyright”
Output: © is symbol of copyright
Explanation: In the above example &copy; is
replaced by the corresponding special character
©

为什么编程需要懂一点英语

方法1 –使用unordered_map：以下是步骤：

将HTML实体及其对应的字符存储在Map中。
遍历给定的字符串，如果遇到字符“＆” ，则查找此“＆”号之后是哪个HTML实体。
将该实体对应的字符添加到输出字符串中。
打印输出字符串作为结果。

下面是上述方法的实现：

C++

// C++ program for the above approach
#include <iostream>
#include <string>
#include <unordered_map>
using namespace std;
  
// Parser for a fixed set of named HTML entities, backed by a hash map.
// Call initializeMap() once before parseInputString().
class GfG {
public:
    // Maps an entity exactly as it appears in the input (e.g. "&gt;")
    // to the text that replaces it.
    std::unordered_map<std::string, std::string> m;

public:
    // Fills `m` with the supported entities and their replacement text.
    void initializeMap()
    {
        m["&quot;"] = "\"";
        m["&apos;"] = "'";
        m["&amp;"] = "&";
        m["&gt;"] = ">";
        m["&lt;"] = "<";
        m["&frasl;"] = "/";
        m["&nbsp;"] = " ";
        m["&reg;"] = "®";
        m["&copy;"] = "©";
    }

public:
    // Returns `input` with every entity found in `m` replaced by its
    // mapped text. Anything else, including a lone '&' or an unknown
    // entity, is copied through unchanged.
    std::string parseInputString(std::string input)
    {
        std::string output;
        output.reserve(input.size());

        std::size_t i = 0;
        while (i < input.size()) {
            if (input[i] == '&') {
                // A candidate entity runs from this '&' to the next ';'.
                // Stopping at an intervening '&' keeps a stray ampersand
                // from swallowing the entities that follow it.
                const std::size_t end = input.find_first_of("&;", i + 1);
                if (end != std::string::npos && input[end] == ';') {
                    const auto found = m.find(input.substr(i, end - i + 1));
                    if (found != m.end()) {
                        output += found->second;
                        // Resume after the entity so replacement text is
                        // never decoded a second time.
                        i = end + 1;
                        continue;
                    }
                }
            }

            // Ordinary character, or an '&' that starts no known entity
            output += input[i];
            ++i;
        }

        return output;
    }
};
  
// Driver Code
int main()
{
    // Given String
    string input = "17 > 25 and 25 < 17";
    GfG g;
  
    // Initialised parsed string
    g.initializeMap();
  
    // Function Call
    cout << g.parseInputString(input);
    return 0;
}

C++

// C++ program to Parse the HTML Entities
#include <iostream>
#include <string>
using namespace std;
  
// Parser for a fixed set of named HTML entities that matches each entity
// directly against the input, without a lookup map.
class GfG {

public:
    // Returns `input` with every supported entity replaced by the
    // character(s) it stands for.
    //
    // The string is scanned once. At each '&' the known entities are
    // compared against the text starting there; on a match the
    // replacement is emitted and scanning resumes after the entity.
    // An '&' that starts no known entity is copied unchanged, so the
    // entities after it are still recognised.
    std::string parseInputString(std::string input)
    {
        struct Entity {
            std::string name; // Entity as written in the input
            std::string text; // Text emitted in its place
        };
        static const Entity entities[] = {
            { "&quot;", "\"" },
            { "&apos;", "'" },
            { "&amp;", "&" },
            { "&gt;", ">" },
            { "&lt;", "<" },
            { "&frasl;", "/" },
            { "&nbsp;", " " },
            { "&reg;", "®" },
            { "&copy;", "©" },
        };

        // To store parsed string
        std::string output;
        output.reserve(input.size());

        std::size_t i = 0;
        while (i < input.size()) {
            const Entity* match = nullptr;

            if (input[i] == '&') {
                for (const Entity& candidate : entities) {
                    // compare() clips the range at the end of `input`,
                    // so a truncated entity simply fails to match.
                    if (input.compare(i, candidate.name.size(),
                                      candidate.name) == 0) {
                        match = &candidate;
                        break;
                    }
                }
            }

            if (match != nullptr) {
                output += match->text;
                i += match->name.size();
            }
            else {
                output += input[i];
                ++i;
            }
        }

        // Return the parsed string
        return output;
    }
};
  
// Driver Code
int main()
{
    // Given String
    string input = "17 > 25 and 25 < 17";
    GfG g;
  
    // Initialised parsed string
    g.initializeMap();
  
    // Function Call
    cout << g.parseInputString(input);
    return 0;
}

C++

// C++ program for the above approach
#include <iostream>
#include <regex>
#include <string>
#include <unordered_map>
using namespace std;
  
// Given Expression with mapped value
// Entity table: each key is an entity exactly as it appears in the input
// and doubles as the regular expression that matches it; the value is the
// replacement text.
const std::unordered_map<std::string, std::string> m = {
    { "&quot;", "\"" },
    { "&apos;", "'" },
    { "&amp;", "&" },
    { "&gt;", ">" },
    { "&lt;", "<" },
    { "&frasl;", "/" },
    { "&nbsp;", " " },
    { "&reg;", "®" },
    { "&copy;", "©" }
};

// Returns `input` with every entity listed in `m` replaced by its mapped
// value, using one regex_replace() pass per entity.
//
// "&amp;" is deliberately handled last. Its replacement is the only one
// that produces a '&', so decoding it earlier could create a new entity
// (e.g. "&amp;gt;" -> "&gt;") that a later pass would decode a second
// time. The map's iteration order is unspecified, so the ordering has to
// be enforced explicitly.
std::string
parseInputString(std::string input)
{
    const std::string ampersandEntity = "&amp;";

    for (const auto& it : m) {
        if (it.first == ampersandEntity) {
            continue;
        }

        // Create the regex for this entity and replace all of its
        // occurrences with the mapped value
        const std::regex e(it.first);
        input = std::regex_replace(input, e, it.second);
    }

    const auto amp = m.find(ampersandEntity);
    if (amp != m.end()) {
        const std::regex e(amp->first);
        input = std::regex_replace(input, e, amp->second);
    }

    // Return the parsed string
    return input;
}
  
// Driver Code
int main()
{
    // Given String
    string input
        = "17 > 25 and 25 < 17";
  
    // Function Call
    cout << parseInputString(input);
    return 0;
}

输出：

17 > 25 and 25 < 17

时间复杂度： O(N)
辅助空间： O(N)

方法2 –使用模式匹配：
步骤如下：

遍历给定的字符串str 。
在遍历时，如果遇到字符“＆” ，则查找此“＆”号之后是哪个HTML实体。
根据上表找到与该实体匹配的字符，并将其添加到输出字符串中。
遍历上面的字符串后，将输出的字符串作为结果打印。

下面是上述方法的实现：

C++

// C++ program to Parse the HTML Entities
#include <iostream>
#include <string>
using namespace std;
  
// Parser for a fixed set of named HTML entities that matches each entity
// directly against the input, without a lookup map.
class GfG {

public:
    // Returns `input` with every supported entity replaced by the
    // character(s) it stands for.
    //
    // The string is scanned once. At each '&' the known entities are
    // compared against the text starting there; on a match the
    // replacement is emitted and scanning resumes after the entity.
    // An '&' that starts no known entity is copied unchanged, so the
    // entities after it are still recognised.
    std::string parseInputString(std::string input)
    {
        struct Entity {
            std::string name; // Entity as written in the input
            std::string text; // Text emitted in its place
        };
        static const Entity entities[] = {
            { "&quot;", "\"" },
            { "&apos;", "'" },
            { "&amp;", "&" },
            { "&gt;", ">" },
            { "&lt;", "<" },
            { "&frasl;", "/" },
            { "&nbsp;", " " },
            { "&reg;", "®" },
            { "&copy;", "©" },
        };

        // To store parsed string
        std::string output;
        output.reserve(input.size());

        std::size_t i = 0;
        while (i < input.size()) {
            const Entity* match = nullptr;

            if (input[i] == '&') {
                for (const Entity& candidate : entities) {
                    // compare() clips the range at the end of `input`,
                    // so a truncated entity simply fails to match.
                    if (input.compare(i, candidate.name.size(),
                                      candidate.name) == 0) {
                        match = &candidate;
                        break;
                    }
                }
            }

            if (match != nullptr) {
                output += match->text;
                i += match->name.size();
            }
            else {
                output += input[i];
                ++i;
            }
        }

        // Return the parsed string
        return output;
    }
};
  
// Driver Code
int main()
{
    // Given String
    string input = "17 > 25 and 25 < 17";
    GfG g;
  
    // Initialised parsed string
    g.initializeMap();
  
    // Function Call
    cout << g.parseInputString(input);
    return 0;
}

输出：

17 > 25 and 25 < 17

时间复杂度： O(N)
辅助空间： O(N)

方法3 –使用正则表达式：
步骤如下：

将所有表达式及其映射值存储在Map M中。
对于映射中的每个键，使用以下命令创建一个正则表达式：

regex e(key);

为什么编程需要懂一点英语
然后，将该正则表达式在字符串中的所有匹配项替换为Map M中对应的映射值：

regex_replace(str, e, value);
where,
str is the input string,
e is the expression formed in the above step, and
value is the value mapped with expression e in the Map

为什么编程需要懂一点英语
对Map中的每个表达式重复上述步骤，直到所有表达式都已被替换。

下面是上述方法的实现：

C++

// C++ program for the above approach
#include <iostream>
#include <regex>
#include <string>
#include <unordered_map>
using namespace std;
  
// Given Expression with mapped value
// Entity table: each key is an entity exactly as it appears in the input
// and doubles as the regular expression that matches it; the value is the
// replacement text.
const std::unordered_map<std::string, std::string> m = {
    { "&quot;", "\"" },
    { "&apos;", "'" },
    { "&amp;", "&" },
    { "&gt;", ">" },
    { "&lt;", "<" },
    { "&frasl;", "/" },
    { "&nbsp;", " " },
    { "&reg;", "®" },
    { "&copy;", "©" }
};

// Returns `input` with every entity listed in `m` replaced by its mapped
// value, using one regex_replace() pass per entity.
//
// "&amp;" is deliberately handled last. Its replacement is the only one
// that produces a '&', so decoding it earlier could create a new entity
// (e.g. "&amp;gt;" -> "&gt;") that a later pass would decode a second
// time. The map's iteration order is unspecified, so the ordering has to
// be enforced explicitly.
std::string
parseInputString(std::string input)
{
    const std::string ampersandEntity = "&amp;";

    for (const auto& it : m) {
        if (it.first == ampersandEntity) {
            continue;
        }

        // Create the regex for this entity and replace all of its
        // occurrences with the mapped value
        const std::regex e(it.first);
        input = std::regex_replace(input, e, it.second);
    }

    const auto amp = m.find(ampersandEntity);
    if (amp != m.end()) {
        const std::regex e(amp->first);
        input = std::regex_replace(input, e, amp->second);
    }

    // Return the parsed string
    return input;
}
  
// Driver Code
int main()
{
    // Given String
    string input
        = "17 > 25 and 25 < 17";
  
    // Function Call
    cout << parseInputString(input);
    return 0;
}

输出：

17 > 25 and 25 < 17

时间复杂度： O(N)
辅助空间： O(N)