Image

Recently I was presented with the problem of decoding and encoding HTML character entities with JavaScript. The task seems simple! However, there are a few gotchas that I’d like to point out, and I’d also like to show a bit of code that performs the task.

The real work of the code is just these few lines:

/**
 * Decodes HTML entities in a string back into the characters they represent.
 *
 * Every entity listed in HtmlEntities.map is replaced by its character first.
 * The two entities that are deliberately kept out of the map, &quot; and
 * &amp;, are decoded afterwards, in that order.
 *
 * @param {string} string - Text that may contain HTML entities.
 * @returns {string} The text with the known entities replaced by characters.
 */
HtmlEntities.decode = function(string) {
    var entityMap = HtmlEntities.map;
    for (var key in entityMap) {
        // Ignore anything inherited through the prototype chain.
        if (!Object.prototype.hasOwnProperty.call(entityMap, key)) {
            continue;
        }
        var entity = entityMap[key];
        // Escape regex metacharacters so the entity is always matched literally.
        var regex = new RegExp(entity.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g');
        string = string.replace(regex, key);
    }
    // " is decoded after the map and before &.
    string = string.replace(/&quot;/g, '"');
    // & is decoded last: decoding it earlier would turn an escaped entity such
    // as &amp;lt; into &lt;, which a later step would wrongly decode again.
    string = string.replace(/&amp;/g, '&');
    return string;
};

/**
 * Encodes special characters in a string as HTML entities.
 *
 * & is encoded first and " second; both are handled here rather than through
 * HtmlEntities.map. Every character listed in the map is then replaced by its
 * entity.
 *
 * @param {string} string - Raw text to encode.
 * @returns {string} The text with & and ", plus every mapped character,
 *     replaced by HTML entities.
 */
HtmlEntities.encode = function(string) {
    var entityMap = HtmlEntities.map;
    // & must go first: every entity produced below contains an ampersand, and
    // encoding & afterwards would encode those ampersands a second time.
    string = string.replace(/&/g, '&amp;');
    string = string.replace(/"/g, '&quot;');
    for (var key in entityMap) {
        // Ignore anything inherited through the prototype chain.
        if (!Object.prototype.hasOwnProperty.call(entityMap, key)) {
            continue;
        }
        var entity = entityMap[key];
        // Escape regex metacharacters so a key such as * or . is matched
        // literally instead of being interpreted as a pattern.
        var regex = new RegExp(key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g');
        string = string.replace(regex, entity);
    }
    return string;
};

The power of this comes from the map.

/**
 * Lookup table pairing each literal character (key) with the HTML entity that
 * represents it (value). Entity names are case-sensitive.
 *
 * " and & are intentionally absent: encode() and decode() handle those two
 * separately because the order in which they are processed matters.
 */
HtmlEntities.map = {
    "'": "&#39;",
    "<": "&lt;",
    ">": "&gt;",
    // No-break space, written as an escape because the character is invisible.
    "\u00A0": "&nbsp;",
    "¡": "&iexcl;",
    "¢": "&cent;",
    "£": "&pound;",
    "¤": "&curren;",
    "¥": "&yen;",
    "¦": "&brvbar;",
    "§": "&sect;",
    "¨": "&uml;",
    "©": "&copy;",
    "ª": "&ordf;",
    "«": "&laquo;",
    "¬": "&not;",
    "®": "&reg;",
    "¯": "&macr;",
    "°": "&deg;",
    "±": "&plusmn;",
    "²": "&sup2;",
    "³": "&sup3;",
    "´": "&acute;",
    "µ": "&micro;",
    "¶": "&para;",
    "·": "&middot;",
    "¸": "&cedil;",
    "¹": "&sup1;",
    "º": "&ordm;",
    "»": "&raquo;",
    "¼": "&frac14;",
    "½": "&frac12;",
    "¾": "&frac34;",
    "¿": "&iquest;",
    "À": "&Agrave;",
    "Á": "&Aacute;",
    "Â": "&Acirc;",
    "Ã": "&Atilde;",
    "Ä": "&Auml;",
    "Å": "&Aring;",
    "Æ": "&AElig;",
    "Ç": "&Ccedil;",
    "È": "&Egrave;",
    "É": "&Eacute;",
    "Ê": "&Ecirc;",
    "Ë": "&Euml;",
    "Ì": "&Igrave;",
    "Í": "&Iacute;",
    "Î": "&Icirc;",
    "Ï": "&Iuml;",
    "Ð": "&ETH;",
    "Ñ": "&Ntilde;",
    "Ò": "&Ograve;",
    "Ó": "&Oacute;",
    "Ô": "&Ocirc;",
    "Õ": "&Otilde;",
    "Ö": "&Ouml;",
    "×": "&times;",
    "Ø": "&Oslash;",
    "Ù": "&Ugrave;",
    "Ú": "&Uacute;",
    "Û": "&Ucirc;",
    "Ü": "&Uuml;",
    "Ý": "&Yacute;",
    "Þ": "&THORN;",
    "ß": "&szlig;",
    "à": "&agrave;",
    "á": "&aacute;",
    "â": "&acirc;",
    "ã": "&atilde;",
    "ä": "&auml;",
    "å": "&aring;",
    "æ": "&aelig;",
    "ç": "&ccedil;",
    "è": "&egrave;",
    "é": "&eacute;",
    "ê": "&ecirc;",
    "ë": "&euml;",
    "ì": "&igrave;",
    "í": "&iacute;",
    "î": "&icirc;",
    "ï": "&iuml;",
    "ð": "&eth;",
    "ñ": "&ntilde;",
    "ò": "&ograve;",
    "ó": "&oacute;",
    "ô": "&ocirc;",
    "õ": "&otilde;",
    "ö": "&ouml;",
    "÷": "&divide;",
    "ø": "&oslash;",
    "ù": "&ugrave;",
    "ú": "&uacute;",
    "û": "&ucirc;",
    "ü": "&uuml;",
    "ý": "&yacute;",
    "þ": "&thorn;",
    "ÿ": "&yuml;",
    "Œ": "&OElig;",
    "œ": "&oelig;",
    "Š": "&Scaron;",
    "š": "&scaron;",
    "Ÿ": "&Yuml;",
    "ƒ": "&fnof;",
    "ˆ": "&circ;",
    "˜": "&tilde;",
    "Α": "&Alpha;",
    "Β": "&Beta;",
    "Γ": "&Gamma;",
    "Δ": "&Delta;",
    "Ε": "&Epsilon;",
    "Ζ": "&Zeta;",
    "Η": "&Eta;",
    "Θ": "&Theta;",
    "Ι": "&Iota;",
    "Κ": "&Kappa;",
    "Λ": "&Lambda;",
    "Μ": "&Mu;",
    "Ν": "&Nu;",
    "Ξ": "&Xi;",
    "Ο": "&Omicron;",
    "Π": "&Pi;",
    "Ρ": "&Rho;",
    "Σ": "&Sigma;",
    "Τ": "&Tau;",
    "Υ": "&Upsilon;",
    "Φ": "&Phi;",
    "Χ": "&Chi;",
    "Ψ": "&Psi;",
    "Ω": "&Omega;",
    "α": "&alpha;",
    "β": "&beta;",
    "γ": "&gamma;",
    "δ": "&delta;",
    "ε": "&epsilon;",
    "ζ": "&zeta;",
    "η": "&eta;",
    "θ": "&theta;",
    "ι": "&iota;",
    "κ": "&kappa;",
    "λ": "&lambda;",
    "μ": "&mu;",
    "ν": "&nu;",
    "ξ": "&xi;",
    "ο": "&omicron;",
    "π": "&pi;",
    "ρ": "&rho;",
    "ς": "&sigmaf;",
    "σ": "&sigma;",
    "τ": "&tau;",
    "υ": "&upsilon;",
    "φ": "&phi;",
    "χ": "&chi;",
    "ψ": "&psi;",
    "ω": "&omega;",
    "ϑ": "&thetasym;",
    // The entity name is &upsih; (lowercase u); &Upsih; is not a defined entity.
    "ϒ": "&upsih;",
    "ϖ": "&piv;",
    "–": "&ndash;",
    "—": "&mdash;",
    "‘": "&lsquo;",
    "’": "&rsquo;",
    "‚": "&sbquo;",
    "“": "&ldquo;",
    "”": "&rdquo;",
    "„": "&bdquo;",
    "†": "&dagger;",
    "‡": "&Dagger;",
    "•": "&bull;",
    "…": "&hellip;",
    "‰": "&permil;",
    "′": "&prime;",
    "″": "&Prime;",
    "‹": "&lsaquo;",
    "›": "&rsaquo;",
    "‾": "&oline;",
    "⁄": "&frasl;",
    "€": "&euro;",
    "ℑ": "&image;",
    "℘": "&weierp;",
    "ℜ": "&real;",
    "™": "&trade;",
    "ℵ": "&alefsym;",
    "←": "&larr;",
    "↑": "&uarr;",
    "→": "&rarr;",
    "↓": "&darr;",
    "↔": "&harr;",
    "↵": "&crarr;",
    "⇐": "&lArr;",
    // The entity name is &uArr; (lowercase u); &UArr; is not a defined entity.
    "⇑": "&uArr;",
    "⇒": "&rArr;",
    "⇓": "&dArr;",
    "⇔": "&hArr;",
    "∀": "&forall;",
    "∂": "&part;",
    "∃": "&exist;",
    "∅": "&empty;",
    "∇": "&nabla;",
    "∈": "&isin;",
    "∉": "&notin;",
    "∋": "&ni;",
    "∏": "&prod;",
    "∑": "&sum;",
    "−": "&minus;",
    "∗": "&lowast;",
    "√": "&radic;",
    "∝": "&prop;",
    "∞": "&infin;",
    "∠": "&ang;",
    "∧": "&and;",
    "∨": "&or;",
    "∩": "&cap;",
    "∪": "&cup;",
    "∫": "&int;",
    "∴": "&there4;",
    "∼": "&sim;",
    "≅": "&cong;",
    "≈": "&asymp;",
    "≠": "&ne;",
    "≡": "&equiv;",
    "≤": "&le;",
    "≥": "&ge;",
    "⊂": "&sub;",
    "⊃": "&sup;",
    "⊄": "&nsub;",
    "⊆": "&sube;",
    "⊇": "&supe;",
    "⊕": "&oplus;",
    "⊗": "&otimes;",
    "⊥": "&perp;",
    "⋅": "&sdot;",
    "⌈": "&lceil;",
    "⌉": "&rceil;",
    "⌊": "&lfloor;",
    "⌋": "&rfloor;",
    "⟨": "&lang;",
    "⟩": "&rang;",
    "◊": "&loz;",
    "♠": "&spades;",
    "♣": "&clubs;",
    "♥": "&hearts;",
    "♦": "&diams;"
};

Note how the literal character is used as the key and its HTML entity is the value of the map. This allows for elegant pairing of the two values and easy access in a loop.

Now for the fun part! The replacement of text via Regular Expressions. The character or entity is replaced in the given text via a global regular expression. This means that every instance of the given string will be replaced with another given string.

In order to properly encode and decode characters we must take into consideration a few gotchas:

  1. The " and & cannot exist in the map
  2. The & must be encoded first and decoded last
  3. The " must be encoded after the & and decoded before the &

Why you ask?

First, the " cannot conveniently be a key of the JavaScript map object. All of the other characters are surrounded by double quotes. While the ' can be used as a key if surrounded by double quotes, the " would have to be escaped (as "\"") or quoted differently from every other key, and I did not want to change the quoting just for one key-value pair. This means a choice must be made: exclude the single or the double quote? I chose the double.

Next the & cannot exist in the map because it must be handled specially. If the loop is running and all of the &’s are encoded after other characters have been encoded, then the encoding of the &’s will duplicate HTML entities that were not meant to be created. Example:

  1. Raw text is 'This is < that & done!'
  2. We encode the < into &lt;
  3. Next when we go to encode the & to be &amp; we also find the & in &lt;
  4. The result is 'This is &amp;lt; that &amp; done!'
  5. The encode and decode of the same text will now differ

To avoid this we first encode all of the & with

// Encode every raw & first, before any other entity is produced.
string = string.replace(/&/g, '&amp;');

Now that we don’t have to worry about duplicating &amp; all over the place we can safely continue by encoding the ".

// Encode every " as &quot;; it is handled here rather than through the map.
string = string.replace(/"/g, '&quot;');

With the special cases out of the way we can continue with the rest of the map.

// Walk the map: each key is a literal character, each value its HTML entity.
for (var key in entityMap) {
    var entity = entityMap[key];
    // The 'g' flag makes replace() substitute every occurrence, not just the first.
    var regex = new RegExp(key, 'g');
    string = string.replace(regex, entity);
}
return string;

The process is reversed upon decode:

// Decode the mapped entities first: each entity (value) is replaced by its
// literal character (key).
for (var key in entityMap) {
    var entity = entityMap[key];
    var regex = new RegExp(entity, 'g');
    string = string.replace(regex, key);
}
// " is decoded before &, the reverse of the order used when encoding.
string = string.replace(/&quot;/g, '"');
// & is decoded last so that escaped text such as &amp;lt; becomes &lt;
// instead of being decoded a second time into <.
string = string.replace(/&amp;/g, '&');

If you want to mess around with an example here it is on JS Fiddle.

View the project on GitHub.