如果使用 iconv() 函数转换编码就相比比较简单了,不过很多虚拟主机里并不支持这个组件,我在网上找半天,才找到一个gb2312转utf-8的方法,但不能逆向转换。
这个函数如下:
/*******************************
//GB转UTF-8编码
*******************************/
function gb2utf8($gbstr) {
global $CODETABLE;
if(trim($gbstr)=="") return $gbstr;
if(empty($CODETABLE)){
$filename = dirname(__FILE__)."/gb2312-utf8.table";
$fp = fopen($filename,"r");
while ($l = fgets($fp,15))
{ $CODETABLE[hexdec(substr($l, 0, 6))] = substr($l, 7, 6); }
fclose($fp);
}
$ret = "";
$utf8 = "";
while ($gbstr) {
if (ord(substr($gbstr, 0, 1)) > 127) {
$thisW = substr($gbstr, 0, 2);
$gbstr = substr($gbstr, 2, strlen($gbstr));
$utf8 = "";
@$utf8 = u2utf8(hexdec($CODETABLE[hexdec(bin2hex($thisW)) - 0x8080]));
if($utf8!=""){
for ($i = 0;$i < strlen($utf8);$i += 3)
$ret .= chr(substr($utf8, $i, 3));
}
}
else
{
$ret .= substr($gbstr, 0, 1);
$gbstr = substr($gbstr, 1, strlen($gbstr));
}
}
return $ret;
}
//Unicode转utf8
function u2utf8($c) {
for ($i = 0;$i < count($c);$i++)
$str = "";
if ($c < 0x80) {
$str .= $c;
} else if ($c < 0x800) {
$str .= (0xC0 | $c >> 6);
$str .= (0x80 | $c & 0x3F);
} else if ($c < 0x10000) {
$str .= (0xE0 | $c >> 12);
$str .= (0x80 | $c >> 6 & 0x3F);
$str .= (0x80 | $c & 0x3F);
} else if ($c < 0x200000) {
$str .= (0xF0 | $c >> 18);
$str .= (0x80 | $c >> 12 & 0x3F);
$str .= (0x80 | $c >> 6 & 0x3F);
$str .= (0x80 | $c & 0x3F);
}
return $str;
}
因为gb2312都是双字节的,因此转换为utf-8就相对比较简单,但反之有很麻烦了,我尝试了一下:
这样
function utf82gb($utfstr)
{
global $UC2GBTABLE;
$okstr = "";
if(trim($utfstr)=="") return $utfstr;
if(empty($UC2GBTABLE)){
$filename = dirname(__FILE__)."/gb2312-utf8.table";
$fp = fopen($filename,"r");
while($l = fgets($fp,15))
{ $UC2GBTABLE[hexdec(substr($l, 7, 6))] = hexdec(substr($l, 0, 6));}
fclose($fp);
}
$ulen = strlen($utfstr);
for($i=0;$i<$ulen;$i++)
{
if(ord($utfstr[$i])<0x81) $okstr .= $utfstr[$i];
else
{
if($ulen>$i+2)
{
$utfc = substr($utfstr,$i,3);
$c = "";
@$c = dechex($UC2GBTABLE[utf82u_3($utfc)]+0x8080);
if($c!=""){
$okstr .= chr(hexdec($c[0].$c[1])).chr(hexdec($c[2].$c[3]));
}
}
else
{ $okstr .= $utfstr[$i]; }
}
}
$okstr = trim($okstr);
return $okstr;
}
function utf82u_3($c)
{
$n = (ord($c[0]) & 0x1f) << 12;
$n += (ord($c[1]) & 0x3f) << 6;
$n += ord($c[2]) & 0x3f;
return $n;
}
按这种方法,大部份字符也算是能转换成功的了,不过总是有点不妥之处,我把程序改成这样子:
function utf82gb($utfstr)
{
global $UC2GBTABLE;
$okstr = "";
if(trim($utfstr)=="") return $utfstr;
if(empty($UC2GBTABLE)){
$filename = dirname(__FILE__)."/gb2312-utf8.table";
$fp = fopen($filename,"r");
while($l = fgets($fp,15))
{ $UC2GBTABLE[hexdec(substr($l, 7, 6))] = hexdec(substr($l, 0, 6));}
fclose($fp);
}
$okstr = "";
$utfstr = urlencode($utfstr);
$ulen = strlen($utfstr);
for($i=0;$i<$ulen;$i++)
{
if($utfstr[$i]=="%")
{
if($ulen>$i+2){
$hexnext = hexdec("0x".substr($utfstr,$i+1,2));
if($hexnext<127){
$okstr .= chr($hexnext);
$i = $i+2;
}
else{
if($ulen>=$i+9){
$hexnext = substr($utfstr,$i+1,8);
$c = "";
@$c = dechex($UC2GBTABLE[url_utf2u($hexnext)]+0x8080);
if($c!=""){
$okstr .= chr(hexdec($c[0].$c[1])).chr(hexdec($c[2].$c[3]));
}
$i = $i+8;
}
}
}
else
{ $okstr .= $utfstr[$i]; }
}
else if($utfstr[$i]=="+")
$okstr .= " ";
else
$okstr .= $utfstr[$i];
}
$okstr = trim($okstr);
return $okstr;
}
//三字节的URL编码转成的utf8字符转为unicode编码
function url_utf2u($c)
{
$utfc = "";
$cs = split("%",$c);
for($i=0;$i<count($cs);$i++){
$utfc .= chr(hexdec("0x".$cs[$i]));
}
$n = (ord($utfc[0]) & 0x1f) << 12;
$n += (ord($utfc[1]) & 0x3f) << 6;
$n += ord($utfc[2]) & 0x3f;
return $n;
}
这个函数如下:
/*******************************
//GB转UTF-8编码
*******************************/
function gb2utf8($gbstr) {
global $CODETABLE;
if(trim($gbstr)=="") return $gbstr;
if(empty($CODETABLE)){
$filename = dirname(__FILE__)."/gb2312-utf8.table";
$fp = fopen($filename,"r");
while ($l = fgets($fp,15))
{ $CODETABLE[hexdec(substr($l, 0, 6))] = substr($l, 7, 6); }
fclose($fp);
}
$ret = "";
$utf8 = "";
while ($gbstr) {
if (ord(substr($gbstr, 0, 1)) > 127) {
$thisW = substr($gbstr, 0, 2);
$gbstr = substr($gbstr, 2, strlen($gbstr));
$utf8 = "";
@$utf8 = u2utf8(hexdec($CODETABLE[hexdec(bin2hex($thisW)) - 0x8080]));
if($utf8!=""){
for ($i = 0;$i < strlen($utf8);$i += 3)
$ret .= chr(substr($utf8, $i, 3));
}
}
else
{
$ret .= substr($gbstr, 0, 1);
$gbstr = substr($gbstr, 1, strlen($gbstr));
}
}
return $ret;
}
//Unicode转utf8
function u2utf8($c) {
for ($i = 0;$i < count($c);$i++)
$str = "";
if ($c < 0x80) {
$str .= $c;
} else if ($c < 0x800) {
$str .= (0xC0 | $c >> 6);
$str .= (0x80 | $c & 0x3F);
} else if ($c < 0x10000) {
$str .= (0xE0 | $c >> 12);
$str .= (0x80 | $c >> 6 & 0x3F);
$str .= (0x80 | $c & 0x3F);
} else if ($c < 0x200000) {
$str .= (0xF0 | $c >> 18);
$str .= (0x80 | $c >> 12 & 0x3F);
$str .= (0x80 | $c >> 6 & 0x3F);
$str .= (0x80 | $c & 0x3F);
}
return $str;
}
因为gb2312都是双字节的,因此转换为utf-8就相对比较简单,但反之有很麻烦了,我尝试了一下:
这样
function utf82gb($utfstr)
{
global $UC2GBTABLE;
$okstr = "";
if(trim($utfstr)=="") return $utfstr;
if(empty($UC2GBTABLE)){
$filename = dirname(__FILE__)."/gb2312-utf8.table";
$fp = fopen($filename,"r");
while($l = fgets($fp,15))
{ $UC2GBTABLE[hexdec(substr($l, 7, 6))] = hexdec(substr($l, 0, 6));}
fclose($fp);
}
$ulen = strlen($utfstr);
for($i=0;$i<$ulen;$i++)
{
if(ord($utfstr[$i])<0x81) $okstr .= $utfstr[$i];
else
{
if($ulen>$i+2)
{
$utfc = substr($utfstr,$i,3);
$c = "";
@$c = dechex($UC2GBTABLE[utf82u_3($utfc)]+0x8080);
if($c!=""){
$okstr .= chr(hexdec($c[0].$c[1])).chr(hexdec($c[2].$c[3]));
}
}
else
{ $okstr .= $utfstr[$i]; }
}
}
$okstr = trim($okstr);
return $okstr;
}
function utf82u_3($c)
{
$n = (ord($c[0]) & 0x1f) << 12;
$n += (ord($c[1]) & 0x3f) << 6;
$n += ord($c[2]) & 0x3f;
return $n;
}
按这种方法,大部份字符也算是能转换成功的了,不过总是有点不妥之处,我把程序改成这样子:
function utf82gb($utfstr)
{
global $UC2GBTABLE;
$okstr = "";
if(trim($utfstr)=="") return $utfstr;
if(empty($UC2GBTABLE)){
$filename = dirname(__FILE__)."/gb2312-utf8.table";
$fp = fopen($filename,"r");
while($l = fgets($fp,15))
{ $UC2GBTABLE[hexdec(substr($l, 7, 6))] = hexdec(substr($l, 0, 6));}
fclose($fp);
}
$okstr = "";
$utfstr = urlencode($utfstr);
$ulen = strlen($utfstr);
for($i=0;$i<$ulen;$i++)
{
if($utfstr[$i]=="%")
{
if($ulen>$i+2){
$hexnext = hexdec("0x".substr($utfstr,$i+1,2));
if($hexnext<127){
$okstr .= chr($hexnext);
$i = $i+2;
}
else{
if($ulen>=$i+9){
$hexnext = substr($utfstr,$i+1,8);
$c = "";
@$c = dechex($UC2GBTABLE[url_utf2u($hexnext)]+0x8080);
if($c!=""){
$okstr .= chr(hexdec($c[0].$c[1])).chr(hexdec($c[2].$c[3]));
}
$i = $i+8;
}
}
}
else
{ $okstr .= $utfstr[$i]; }
}
else if($utfstr[$i]=="+")
$okstr .= " ";
else
$okstr .= $utfstr[$i];
}
$okstr = trim($okstr);
return $okstr;
}
//三字节的URL编码转成的utf8字符转为unicode编码
function url_utf2u($c)
{
$utfc = "";
$cs = split("%",$c);
for($i=0;$i<count($cs);$i++){
$utfc .= chr(hexdec("0x".$cs[$i]));
}
$n = (ord($utfc[0]) & 0x1f) << 12;
$n += (ord($utfc[1]) & 0x3f) << 6;
$n += ord($utfc[2]) & 0x3f;
return $n;
}
作者:jackxiang@向东博客 专注WEB应用 构架之美 --- 构架之美,在于尽态极妍 | 应用之美,在于药到病除
地址:https://jackxiang.com/post/857/
版权所有。转载时必须以链接形式注明作者和原始出处及本声明!
评论列表