JavaScript: read ASCII characters from a Unicode string


Quick ASCII characters reference:

{
"31": "\u001f", "32": " ",     "33": "!",     "34": "\"",    "35": "#",
"36": "$",     "37": "%",     "38": "&",     "39": "'",     "40": "(",
"41": ")",     "42": "*",     "43": "+",     "44": ",",     "45": "-",
"46": ".",     "47": "/",     "48": "0",     "49": "1",     "50": "2",
"51": "3",     "52": "4",     "53": "5",     "54": "6",     "55": "7",
"56": "8",     "57": "9",     "58": ":",     "59": ";",     "60": "<",
"61": "=",     "62": ">",     "63": "?",     "64": "@",     "65": "A",
"66": "B",     "67": "C",     "68": "D",     "69": "E",     "70": "F",
"71": "G",     "72": "H",     "73": "I",     "74": "J",     "75": "K",
"76": "L",     "77": "M",     "78": "N",     "79": "O",     "80": "P",
"81": "Q",     "82": "R",     "83": "S",     "84": "T",     "85": "U",
"86": "V",     "87": "W",     "88": "X",     "89": "Y",     "90": "Z",
"91": "[",     "92": "\\",    "93": "]",     "94": "^",     "95": "_",
"96": "`",     "97": "a",     "98": "b",     "99": "c",     "100": "d",
"101": "e",    "102": "f",    "103": "g",    "104": "h",    "105": "i",
"106": "j",    "107": "k",    "108": "l",    "109": "m",    "110": "n",
"111": "o",    "112": "p",    "113": "q",    "114": "r",    "115": "s",
"116": "t",    "117": "u",    "118": "v",    "119": "w",    "120": "x",
"121": "y",    "122": "z",    "123": "{",    "124": "|",    "125": "}",
"126": "~",    "127": "\u007f"
}

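JavaScript stores strings as sequences of UTF-16 code units (16 bits each). Characters outside the Basic Multilingual Plane, such as emoji, take two code units (a surrogate pair):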

> "\u{2122}"
"™"

> "\u2122"
"™"

> "\u{1f4ab}"
"💫"

> "\uD83D\uDCAB"
"💫"

If you use the method String.prototype.charCodeAt, you get a UTF-16 code unit, not an ASCII code; String.prototype.codePointAt returns the full Unicode code point instead:

> "™".charCodeAt(0)
8482

> "💫".charCodeAt(0)
55357

> "💫".charCodeAt(1)
56491

> "💫".codePointAt(0)
128171

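So if you want to read a string as single-byte values, you can use this method: read each code point of the string, then convert it into its UTF-8 bytes (each in the range 0–255; plain ASCII characters keep their ASCII codes). For background, read this article: PHP function chr of unicode character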

We write the function codePointToASCIIs based on the function unichr3.

JavaScript code:

/**
 * Encodes a single Unicode code point as its UTF-8 byte sequence.
 *
 * Despite the name, the result is not limited to ASCII: code points below
 * 0x80 yield one byte equal to their ASCII code, and larger code points
 * yield two to four bytes, each in the range 0x80-0xF4.
 *
 * Surrogate values (0xD800-0xDFFF) are not rejected; they are encoded as
 * three bytes so that callers passing raw UTF-16 code units keep working.
 *
 * @param {number} dec - Code point to encode, an integer in 0..0x10FFFF.
 * @returns {number[]} The UTF-8 bytes (1 to 4 entries), or an empty array
 *   when `dec` is not an integer inside the Unicode range.
 */
function codePointToASCIIs(dec) {
  // Negative, fractional, NaN, non-number and beyond-Unicode values have no
  // UTF-8 encoding; signal that with the same empty result used for any
  // out-of-range input.
  if (!Number.isInteger(dec) || dec < 0 || dec > 0x10FFFF) {
    return [];
  }
  if (dec < 0x80) {
    // 0xxxxxxx
    return [dec];
  }
  if (dec < 0x0800) {
    // 110xxxxx 10xxxxxx
    return [0xC0 | (dec >> 6), 0x80 | (dec & 0x3F)];
  }
  if (dec < 0x010000) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    return [
      0xE0 | (dec >> 12),
      0x80 | ((dec >> 6) & 0x3F),
      0x80 | (dec & 0x3F),
    ];
  }
  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  return [
    0xF0 | (dec >> 18),
    0x80 | ((dec >> 12) & 0x3F),
    0x80 | ((dec >> 6) & 0x3F),
    0x80 | (dec & 0x3F),
  ];
}

Then write the function stringtoasciis:

/**
 * Converts a string into the flat list of byte values that encode it.
 *
 * The string is walked one Unicode code point at a time (not one UTF-16
 * code unit at a time), so a character stored as a surrogate pair is passed
 * to codePointToASCIIs as a single code point instead of as two separate
 * surrogate halves. An unpaired surrogate is passed through as its own
 * code unit value.
 *
 * @param {string} string - Text to convert.
 * @returns {number[]} Concatenation of the arrays codePointToASCIIs returns
 *   for each code point, in string order; empty for an empty string.
 */
function stringtoasciis(string) {
  const bytes = [];
  // String iteration yields whole code points, keeping surrogate pairs
  // together; charCodeAt would split them.
  for (const character of string) {
    const encoded = codePointToASCIIs(character.codePointAt(0));
    // Append in place rather than rebuilding the array with concat on
    // every iteration.
    for (const byte of encoded) {
      bytes.push(byte);
    }
  }
  return bytes;
}

Example:

// Sample input: two emoji from outside the Basic Multilingual Plane around
// the BMP trade-mark sign.
var string = "💩™💫";

// alert() converts the returned array to text, so the values are shown
// separated by commas.
alert(stringtoasciis(string))

Result

237,160,189,237,178,169,226,132,162,237,160,189,237,178,171

Try it yourself

1 Comment

Leave a Reply