我有一个数据库,其中一些元素由HTML特殊字符组成:
| Universidad Tecnológica Nacional - UTN |
| Instituto Tecnológico de Buenos Aires |
| Instituto Superior del Profesorado "Dr. Joaquín V. González" |
| Escuela Nacional de Náutica "Manuel Belgrano" |
| Conservatorio Nacional de Música "Carlos López Buchardo" |
| Instituto Argentino de Computacion - IAC |
| Conservatorio de Superior de Música "Manuel de Falla" |
我需要把它转换成一个合适的UTF格式。我能做得比遍历数据库,从每个代码到等价符号的映射更好吗?
á -> 'á'
" -> '"'
...
最佳答案
正如my comment above中所提到的,在你自己的情况下,你想做什么是非常不清楚的。
我能做得比遍历数据库,从每个代码到等价符号的映射更好吗?
嗯,是的。您可以用替换字符替换字符代码实体(例如{
和ƫ
),而无需在“映射”中查找字符代码。但命名实体(例如"
)将始终需要查找。
以下是我试图解决的一般案件:
创建一个表以存储在HTML中定义的命名字符实体:
CREATE TABLE ents (
ref VARCHAR(8) NOT NULL COLLATE utf8_bin,
rep CHAR(1) NOT NULL,
PRIMARY KEY (ref)
);
填充此表-我建议使用脚本,例如来自PHP的脚本:
$dbh = new PDO("mysql:dbname=$dbname", $username, $password);
$dbh->setAttribute(PDO::ATTR_EMULATE_PREPARES, FALSE);
$ins = $dbh->prepare('INSERT INTO ents (ref, rep) VALUES (?, ?)');
$t = get_html_translation_table(HTML_ENTITIES);
foreach ($t as $k => $v) $ins->execute([substr($v, 1, -1), $k]);
定义一个SQL函数来执行实体替换(在适用的情况下使用此表,或者按字符代码执行):
DELIMITER ;;
CREATE FUNCTION dhe(s TEXT) RETURNS TEXT
BEGIN
DECLARE n, p, i, t INT DEFAULT 0;
DECLARE r VARCHAR(12);
entity_search: LOOP
SET n := LOCATE('&', s, n+1);
IF (!n) THEN
LEAVE entity_search;
END IF;
IF (SUBSTRING(s, n+1, 1) = '#') THEN
CASE
WHEN SUBSTRING(s, n+2, 1) RLIKE '[[:digit:]]' THEN
SET t := 2, p := n+2, r := '[[:digit:]]';
WHEN SUBSTRING(s, n+2, 1) = 'x' THEN
SET t := 3, p := n+3, r := '[[:xdigit:]]';
ELSE ITERATE entity_search;
END CASE;
ELSE
SET t := 1, p := n+1, r := '[[:alnum:]_]';
END IF;
SET i := 0;
reference: LOOP
IF SUBSTRING(s, p+i, 1) NOT RLIKE r THEN
IF SUBSTRING(s, p+i, 1) RLIKE '[[:alnum:]_]' THEN
ITERATE entity_search;
END IF;
LEAVE reference;
END IF;
IF i = 8 THEN ITERATE entity_search; END IF;
SET i := i + 1;
END LOOP reference;
SET s := CONCAT(
LEFT(s, n-1),
CASE t
WHEN 1 THEN COALESCE(
(SELECT rep FROM ents WHERE ref = SUBSTRING(s, p, i))
, SUBSTRING(s, n, i + IF(SUBSTRING(s, p+i, 1)=';',1,0))
)
WHEN 2 THEN CHAR(SUBSTRING(s, p, i))
WHEN 3 THEN CHAR(CONV(SUBSTRING(s, p, i), 16, 10))
END,
SUBSTRING(s, p + i + IF(SUBSTRING(s, p+i, 1)=';',1,0))
);
END LOOP entity_search;
RETURN s;
END;;
DELIMITER ;
应用此函数两次来解码(显然)双编码表:
UPDATE my_table SET my_column = dhe(dhe(my_column));