latin1 vs utf8

For testing encoding on serverside (with HEX()) and clientside (with bin2hex()). Also to test what happens with these encodings when the utf8-variant reads from the latin1 table and vice versa.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
<?php
/*
CREATE TABLE `test_latin1` (
  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
  `tekst` text NOT NULL,
  `tekst_blob` blob,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1
*/
class test
{
    protected $db;
    protected $table 'test_latin1';

    public function __construct() {
        $this->db = new mysqli('localhost''test''test''test') or die('connecting failed');
        $this->db->set_charset('latin1') or die('failed setting charset');
    }

    protected function isIndex($in) {
        return preg_match('#^[1-9][0-9]*$#'$in);
    }

    protected function escape($in) {
        return htmlspecialchars($inENT_QUOTES'ISO-8859-1');
    }

    protected function redirect($url) {
        header('Location: '.$url);
        exit;
    }

    protected function __header() {
        header('Content-Type: text/html; charset=ISO-8859-1');
?><!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>latin1 test</title>
<style type="text/css">
<!--
pre             { font-family: monospace; font-size: 12pt; padding: 10px; background-color: #cccccc; margin: 5px 0; }
pre.hex-server  { background-color: #ffcccc; font-size: 10pt; }
pre.hex-client  { background-color: #ccffcc; font-size: 10pt; }
//-->
</style>
</head>

<body>
<?php
    }

    protected function __footer() {
?></body>
</html><?php
    }

    public function execute() {
        $action 'action'.(isset($_GET['action']) ? ucfirst($_GET['action']) : 'Default');
        if (method_exists($this$action)) {
            $this->$action();
        } else {
            $this->actionDefault();
        }
    }

    protected function actionDefault() {
        $this->__header();

            /* @var $res mysqli_result */
            $res $this->db->query(
                'SELECT id, tekst, HEX(tekst) AS test
                FROM '.$this->table.'
                ORDER BY id'
            ) or die('query failed');
            while ($row $res->fetch_assoc()) {
                $editLink   '?action=edit&id='.$row['id'];
                $deleteLink '?action=delete&id='.$row['id'];
                ?><a href="<?php echo $this->escape($editLink?>">edit</a> |
                <a href="<?php echo $this->escape($deleteLink?>">delete</a>
                <pre><?php echo $this->escape($row['tekst']) ?></pre>
                <pre class="hex-server"><?php echo $this->escape(trim(chunk_split($row['test'], 64"\n"))) ?></pre><?php
                // By calling bin2hex() you can display the local retrieved hex value of a (binary) string.
                // If it differs in any way from the HEX value on the server, this means that MySQL performed
                // conversions. Note that this only happens when the text contains characters that have different
                // encodings in different character sets so "normal" (ASCII) characters are no good way to check
                // whether conversions have been done.
                ?><pre class="hex-client"><?php echo $this->escape(trim(chunk_split(strtoupper(bin2hex($row['tekst'])), 64"\n"))) ?></pre>
                <hr noshade="noshade" /><?php
            }
            $res->free();

            $formAction '?action=add';
            ?><form action="<?php echo $this->escape($formAction?>" method="post" accept-charset="ISO-8859-1">
            <textarea name="tekst" rows="10" cols="80"></textarea><br />
            <button type="submit">submit</button>
            </form><?php
        $this->__footer();
    }

    protected function actionAdd() {
        if ($_SERVER['REQUEST_METHOD'] == 'POST') {
            $this->db->query(
                "INSERT INTO ".$this->table." (tekst, tekst_blob) VALUES (
                    '".$this->db->real_escape_string($_POST['tekst'])."',
                    '".$this->db->real_escape_string($_POST['tekst'])."'
                )"
            ) or die('query failed');
        }
        $this->redirect('?action=default');
    }

    protected function actionEdit() {
        $this->__header();
            $id = isset($_GET['id']) && $this->isIndex($_GET['id']) ? $_GET['id'] : false;

            if ($id) {
                /* @var $res mysqli_result */
                $res $this->db->query(
                    'SELECT *
                    FROM '.$this->table.'
                    WHERE id = '.$this->db->real_escape_string($id)
                );
                if ($res->num_rows) {
                    $row $res->fetch_assoc();
                    $formAction '?action=editProcess&id='.$row['id'];
                    ?><form action="<?php echo $this->escape($formAction?>" method="post" accept-charset="ISO-8859-1">
                    <textarea name="tekst" rows="10" cols="80"><?php echo $this->escape($row['tekst']) ?></textarea><br />
                    <button type="submit">submit</button>
                    </form><?php
                } else {
                    ?><p>[error] no data found for id</p><?php
                }
                $res->free();
            } else {
                ?><p>[error] invalid id</p><?php
            }
        $this->__footer();
    }

    protected function actionEditProcess() {
        if ($_SERVER['REQUEST_METHOD'] == 'POST') {
            if (isset($_GET['id']) && $this->isIndex($_GET['id'])) {
                $this->db->query(
                    "UPDATE ".$this->table." SET
                    tekst = '".$this->db->real_escape_string($_POST['tekst'])."',
                    tekst_blob = '".$this->db->real_escape_string($_POST['tekst'])."'
                    WHERE id = ".$this->db->real_escape_string($_GET['id'])
                ) or die('query failed');
            }
        }
        $this->redirect('?action=default');
    }

    protected function actionDelete() {
        if (isset($_GET['id']) && $this->isIndex($_GET['id'])) {
            $this->db->query(
                'DELETE FROM '.$this->table.'
                WHERE id = '.$this->db->real_escape_string($_GET['id'])
            );
        }
        $this->redirect('?action=default');
    }
} // class

$test = new test();
$test->execute();
?>

And the utf8 equivalent:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
<?php
/*
CREATE TABLE `test_utf8` (
  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
  `tekst` text NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8
 */
class test
{
    protected $db;

    protected $table 'test_utf8';

    public function __construct() {
        $this->db = new mysqli('localhost''test''test''test') or die('connecting failed');
        $this->db->set_charset('utf8') or die('failed setting charset');
    }

    protected function isIndex($in) {
        return preg_match('#^[1-9][0-9]*$#'$in);
    }

    protected function escape($in) {
        return htmlspecialchars($inENT_QUOTES'UTF-8');
    }

    protected function redirect($url) {
        header('Location: '.$url);
        exit;
    }

    protected function __header() {
        header('Content-Type: text/html; charset=UTF-8');
?><!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>UTF-8 test</title>
<style type="text/css">
<!--
pre             { font-family: monospace; font-size: 12pt; padding: 10px; background-color: #cccccc; margin: 5px 0; }
pre.hex-server  { background-color: #ffcccc; font-size: 10pt; }
pre.hex-client  { background-color: #ccffcc; font-size: 10pt; }
//-->
</style>
</head>

<body>
<?php
    }

    protected function __footer() {
?></body>
</html><?php
    }

    public function execute() {
        $action 'action'.(isset($_GET['action']) ? ucfirst($_GET['action']) : 'Default');
        if (method_exists($this$action)) {
            $this->$action();
        } else {
            $this->actionDefault();
        }
    }

    protected function actionDefault() {
        $this->__header();

            /* @var $res mysqli_result */
            $res $this->db->query(
                'SELECT id, tekst, HEX(tekst) AS test
                FROM '.$this->table.'
                ORDER BY id'
            ) or die('query failed');
            while ($row $res->fetch_assoc()) {
                $editLink   '?action=edit&id='.$row['id'];
                $deleteLink '?action=delete&id='.$row['id'];
                ?><a href="<?php echo $this->escape($editLink?>">edit</a> |
                <a href="<?php echo $this->escape($deleteLink?>">delete</a>
                <pre><?php echo $this->escape($row['tekst']) ?></pre>
                <pre class="hex-server"><?php echo $this->escape(trim(chunk_split($row['test'], 64"\n"))) ?></pre><?php
                // By calling bin2hex() you can display the local retrieved hex value of a (binary) string.
                // If it differs in any way from the HEX value on the server, this means that MySQL performed
                // conversions. Note that this only happens when the text contains characters that have different
                // encodings in different character sets so "normal" (ASCII) characters are no good way to check
                // whether conversions have been done.
                ?><pre class="hex-client"><?php echo $this->escape(trim(chunk_split(strtoupper(bin2hex($row['tekst'])), 64"\n"))) ?></pre>
                <hr noshade="noshade" /><?php
            }
            $res->free();

            $formAction '?action=add';
            ?><form action="<?php echo $this->escape($formAction?>" method="post" accept-charset="UTF-8">
            <textarea name="tekst" rows="10" cols="80"></textarea><br />
            <button type="submit">submit</button>
            </form><?php
        $this->__footer();
    }

    protected function actionAdd() {
        if ($_SERVER['REQUEST_METHOD'] == 'POST') {
            $this->db->query(
                "INSERT INTO ".$this->table." (tekst) VALUES (
                    '".$this->db->real_escape_string($_POST['tekst'])."'
                )"
            ) or die('query failed');
        }
        $this->redirect('?action=default');
    }

    protected function actionEdit() {
        $this->__header();
            $id = isset($_GET['id']) && $this->isIndex($_GET['id']) ? $_GET['id'] : false;

            if ($id) {
                /* @var $res mysqli_result */
                $res $this->db->query(
                    'SELECT *
                    FROM '.$this->table.'
                    WHERE id = '.$this->db->real_escape_string($id)
                );
                if ($res->num_rows) {
                    $row $res->fetch_assoc();
                    $formAction '?action=editProcess&id='.$row['id'];
                    ?><form action="<?php echo $this->escape($formAction?>" method="post" accept-charset="UTF-8">
                    <textarea name="tekst" rows="10" cols="80"><?php echo $this->escape($row['tekst']) ?></textarea><br />
                    <button type="submit">submit</button>
                    </form><?php
                } else {
                    ?><p>[error] no data found for id</p><?php
                }
                $res->free();
            } else {
                ?><p>[error] invalid id</p><?php
            }
        $this->__footer();
    }

    protected function actionEditProcess() {
        if ($_SERVER['REQUEST_METHOD'] == 'POST') {
            if (isset($_GET['id']) && $this->isIndex($_GET['id'])) {
                $this->db->query(
                    "UPDATE ".$this->table." SET
                        tekst = '".$this->db->real_escape_string($_POST['tekst'])."'
                    WHERE id = ".$this->db->real_escape_string($_GET['id'])
                ) or die('query failed');
            }
        }
        $this->redirect('?action=default');
    }

    protected function actionDelete() {
        if (isset($_GET['id']) && $this->isIndex($_GET['id'])) {
            $this->db->query(
                'DELETE FROM '.$this->table.'
                WHERE id = '.$this->db->real_escape_string($_GET['id'])
            );
        }
        $this->redirect('?action=default');
    }
} // class

$test = new test();
$test->execute();
?>

Converting the latin1 table (and all the data therein) to utf8 is simply a matter of altering the table charset:

ALTER TABLE test_latin1 CONVERT TO CHARACTER SET utf8;

Afterwards you can still read from it with the latin1 setup. As long as the data within the table is encoded as prescribed by the column or table definition, MySQL will translate the data to the desired character set defined in the _set_charset($charset) method. At least as much as possible, if unsupported (multibyte) characters cannot be translated (for example, from utf8 to latin1), MySQL will return question marks (?) for these symbols. It goes without saying that your data must be stored as defined by the column or table encoding, otherwise there is a large chance that if translations are performed by MySQL they will result in malformed data. In short, like some would say: shit in, shit out.

Consider $charset the character set in which you want to display your data (as defined by the charset in your PHP or meta Content-Type header). This is not necessarily the character set of the database, table or column you read from. But as said before, MySQL will take care of these translations as long as your data is actually of the prescribed character encoding. Ideally everything (Content Type charset, connection type, table and/or column type) has the same encoding, so no (unnecessary) translations have to be performed.

When diving into all this theory it surprised me how well MySQL copes with these character set inconsistencies. If you play by the rules data is stored and displayed correctly, if not, You Are Doing It Wrong.

How to fix utf8 tables with latin1 encoded data

I've found some topics on this subject on the internet. One of the more notable (but maybe also somewhat paranoid) can be found in the Wordpress Codex.

There are a lot of different ways to solve this mess, usually it involves altering column definitions a couple of times along the way. Another way is to introduce temporary columns to move and convert your data leaving your original data unaltered during this process. One of the benefits of this is that you can tinker a bit with your data without (further) messing up the original. When the moving and converting seems succesful (obviously YOU need to test this), you can remove the source column and rename the destination column to your source column. But first of all, you need to establish if something is wrong. Before you start converting anything you should always create a backup of your database. You should also never do this on a live database...

A while back I noticed I had not selected a charset when connecting to a database with an utf8 table to insert bookmark-data. Derp! Everything will appear to work fine as long as you fetch the data in the same way, but in the background MySQL will perform translations on your data which will cause some needless overhead. Apart from that your data appears to take up more space than when it was properly encoded.

So here's the setup: I have an UTF-8 encoded page for displaying data from and inserting data into an utf8 table, but I did not select a charset when connecting to store and fetch my data. My default connection charset was probably latin1. When I display a link from a youtube clip, it looks just fine:

【Hatsune Miku】 Triple Baka 「驫麤~とりぷるばか~」fullver (English Subs) - YouTube

Then I decided to connect in the proper way by selecting the correct charset (utf8). My link then looks like this:

【Hatsune Miku】 Triple Baka 「驫麤~とりぷるばか~」fullver (English Subs) - YouTube

Oops.

One way to ascertain whether data is stored in the wrong way (it was pretty obvious in my case) is to store it in a table in the right way by ensuring that everything is UTF-8 (see the above utf8 script above), and then compare bytes with the HEX() function. HEX() is not character encoding aware so it will always be unambiguous, no matter what table, column or connection type you use. The wrongly latin1 encoded link title looks like this when HEXed (for readability divided into chunks of 64 characters):

C3A3E282ACC29048617473756E65204D696B75C3A3E282ACE280982054726970
6C652042616B6120C3A3E282ACC592C3A9C2A9C2ABC3A9C2BAC2A4C3AFC2BDC5
BEC3A3C281C2A8C3A3E2809AC5A0C3A3C281C2B7C3A3E2809AE280B9C3A3C281
C2B0C3A3C281E280B9C3AFC2BDC5BEC3A3E282ACC28D66756C6C766572202845
6E676C697368205375627329202D20596F7554756265

While it should actually look like this:

E3809048617473756E65204D696B75E3809120547269706C652042616B6120E3
808CE9A9ABE9BAA4EFBD9EE381A8E3828AE381B7E3828BE381B0E3818BEFBD9E
E3808D66756C6C7665722028456E676C697368205375627329202D20596F7554
756265

This relatively short text is 51 bytes (~17%) smaller when encoded correctly (this, of course, depends heavily on the text it represents). Plus you have no translation overhead when you store and retrieve everything in UTF-8 (this will always hold; no translation = less strain on MySQL).

So how to get from a latin1-encoded-data-in-an-utf8-table to an utf8-encoded-data-in-an-utf8-table? One way to achieve this is by using the CAST() and CONVERT() functions. Also, use the correct charset when verifying if this gives the desired result. It is no use testing with faulty settings (by selecting the wrong charset or none at all). This is what caused the entire mess in the first place! I managed to convert the latin1-data in one go, I verified this by actually printing the translated string (which looked okay) but also by checking if the HEX value corresponds with the one I got when properly inserting it. The magic formula is thus (remember to use the right connection charset):

SELECT link_name,
    HEX(link_name) AS link_hex,
    CONVERT(CAST(CONVERT(link_name USING latin1) AS BINARY) USING utf8) AS link_name_utf8,
    HEX(CONVERT(CAST(CONVERT(link_name USING latin1) AS BINARY) USING utf8)) AS link_hex_utf8
FROM links
WHERE link_id = 1234

This seems to do the trick for VARCHAR columns, you might need something else than BINARY for TEXT columns and the like (you might run into length limitations and this might cause your data to be truncated so take care). It yields the following result (somewhat adjusted for readability like before):

link_name:
【Hatsune Miku】 Triple Baka 「驫麤~とりぷるばか~」fullver (English Subs) - YouTube

link_hex:
C3A3E282ACC29048617473756E65204D696B75C3A3E282ACE280982054726970
6C652042616B6120C3A3E282ACC592C3A9C2A9C2ABC3A9C2BAC2A4C3AFC2BDC5
BEC3A3C281C2A8C3A3E2809AC5A0C3A3C281C2B7C3A3E2809AE280B9C3A3C281
C2B0C3A3C281E280B9C3AFC2BDC5BEC3A3E282ACC28D66756C6C766572202845
6E676C697368205375627329202D20596F7554756265

link_name_utf8:
【Hatsune Miku】 Triple Baka 「驫麤~とりぷるばか~」fullver (English Subs) - YouTube

link_hex_utf8:
E3809048617473756E65204D696B75E3809120547269706C652042616B6120E3
808CE9A9ABE9BAA4EFBD9EE381A8E3828AE381B7E3828BE381B0E3818BEFBD9E
E3808D66756C6C7665722028456E676C697368205375627329202D20596F7554
756265

The second HEX value matches the HEX value of the correctly inserted link title so the CAST/CONVERT thingy seems to have done its job.

My bookmarks are safe.

^^;