00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
/**
 * Character-set helper: converts strings between local charsets and UTF-8,
 * maps locales and languages to charsets, and offers charset-aware string
 * functions.
 */
class t3lib_cs {
	/**
	 * Byte value written in place of a character that cannot be converted.
	 * 63 is the ASCII code of "?".
	 */
	var $noCharByteVal = 63;

	/**
	 * Conversion tables loaded by initCharset(), keyed by charset name.
	 * Each entry holds a 'local' map (charset code => UTF-8 sequence) and a
	 * 'utf8' map (UTF-8 sequence => charset code).
	 */
	var $parsedCharsets = array();

	/**
	 * Case folding tables keyed by charset name; each entry holds
	 * 'toUpper', 'toLower' and 'toTitle' maps.
	 */
	var $caseFolding = array();

	/**
	 * Transliteration tables (character => ASCII replacement), keyed by
	 * charset name.
	 */
	var $toASCII = array();

	/**
	 * Charsets handled as two bytes per character.
	 */
	var $twoByteSets = array(
		'ucs-2' => 1,
	);

	/**
	 * Charsets handled as four bytes per character.
	 */
	var $fourByteSets = array(
		'ucs-4'  => 1,
		'utf-32' => 1,
	);

	/**
	 * Charsets in which a byte above 127 starts a two-byte character.
	 * utf8_encode() treats Shift_JIS bytes 0xA0-0xDF as single-byte
	 * exceptions to that rule.
	 */
	var $eucBasedSets = array(
		'gb2312'    => 1,
		'big5'      => 1,
		'euc-kr'    => 1,
		'shift_jis' => 1,
	);
00166
00167
00168
/**
 * Alternative charset names => the canonical name used throughout this class.
 * Looked up by parse_charset() after lower-casing and trimming the input.
 */
var $synonyms = array(
	// ASCII
	'us'              => 'ascii',
	'us-ascii'        => 'ascii',

	// ISO-8859 family
	'cp819'           => 'iso-8859-1',
	'ibm819'          => 'iso-8859-1',
	'iso-ir-100'      => 'iso-8859-1',
	'iso-ir-109'      => 'iso-8859-2',
	'iso-ir-148'      => 'iso-8859-9',
	'iso-ir-199'      => 'iso-8859-14',
	'iso-ir-203'      => 'iso-8859-15',
	'csisolatin1'     => 'iso-8859-1',
	'csisolatin2'     => 'iso-8859-2',
	'csisolatin3'     => 'iso-8859-3',
	'csisolatin5'     => 'iso-8859-9',
	'csisolatin8'     => 'iso-8859-14',
	'csisolatin9'     => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic'      => 'iso-8859-14',
	'latin1'          => 'iso-8859-1',
	'latin2'          => 'iso-8859-2',
	'latin3'          => 'iso-8859-3',
	'latin5'          => 'iso-8859-9',
	'latin6'          => 'iso-8859-10',
	'latin8'          => 'iso-8859-14',
	'latin9'          => 'iso-8859-15',
	'l1'              => 'iso-8859-1',
	'l2'              => 'iso-8859-2',
	'l3'              => 'iso-8859-3',
	'l5'              => 'iso-8859-9',
	'l6'              => 'iso-8859-10',
	'l8'              => 'iso-8859-14',
	'l9'              => 'iso-8859-15',
	'cyrillic'        => 'iso-8859-5',
	'arabic'          => 'iso-8859-6',
	'tis-620'         => 'iso-8859-11',

	// Windows code pages
	'win874'          => 'windows-874',
	'win1250'         => 'windows-1250',
	'win1251'         => 'windows-1251',
	'win1252'         => 'windows-1252',
	'win1253'         => 'windows-1253',
	'win1254'         => 'windows-1254',
	'win1255'         => 'windows-1255',
	'win1256'         => 'windows-1256',
	'win1257'         => 'windows-1257',
	'win1258'         => 'windows-1258',
	'cp1250'          => 'windows-1250',
	'cp1251'          => 'windows-1251',
	'cp1252'          => 'windows-1252',
	'ms-ee'           => 'windows-1250',
	'ms-ansi'         => 'windows-1252',
	'ms-greek'        => 'windows-1253',
	'ms-turk'         => 'windows-1254',
	'winbaltrim'      => 'windows-1257',

	// KOI8 and Macintosh
	'koi-8ru'         => 'koi-8r',
	'koi8r'           => 'koi-8r',
	'cp878'           => 'koi-8r',
	'mac'             => 'macroman',
	'macintosh'       => 'macroman',

	// East Asian multi-byte sets
	'euc-cn'          => 'gb2312',
	'x-euc-cn'        => 'gb2312',
	'euccn'           => 'gb2312',
	'cp936'           => 'gb2312',
	'big-5'           => 'big5',
	'cp950'           => 'big5',
	'eucjp'           => 'euc-jp',
	'sjis'            => 'shift_jis',
	'shift-jis'       => 'shift_jis',
	'cp932'           => 'shift_jis',
	'cp949'           => 'euc-kr',

	// Unicode encodings (the 'utf8' key used to be listed twice; the
	// redundant second entry has been dropped, the resulting array is identical)
	'utf7'            => 'utf-7',
	'utf8'            => 'utf-8',
	'utf16'           => 'utf-16',
	'utf32'           => 'utf-32',
	'ucs2'            => 'ucs-2',
	'ucs4'            => 'ucs-4',
);
00246
00247
/**
 * Language identifier => script family. get_locale_charset() looks up the
 * language part of a locale here and then resolves the script family through
 * $script_to_charset_unix or $script_to_charset_windows.
 *
 * Corrections compared to the earlier table:
 *  - 'isl' pointed to the misspelled, non-existent family 'west_euorpean'
 *  - 'bulgarian' pointed to 'east_european' although 'bg' and 'bgr' are 'cyrillic'
 *  - 'swedish' and 'thai' were only reachable through the misspelled keys
 *    'svedish' and 'that'; the misspelled keys are kept for backwards
 *    compatibility
 */
var $lang_to_script = array(
	// Two-letter language codes
	'ar' => 'arabic',
	'bg' => 'cyrillic',
	'bs' => 'east_european',
	'cs' => 'east_european',
	'da' => 'west_european',
	'de' => 'west_european',
	'es' => 'west_european',
	'et' => 'estonian',
	'eo' => 'unicode',
	'eu' => 'west_european',
	'fa' => 'arabic',
	'fi' => 'west_european',
	'fo' => 'west_european',
	'fr' => 'west_european',
	'gr' => 'greek',
	'ge' => 'unicode',
	'he' => 'hebrew',
	'hi' => 'unicode',
	'hr' => 'east_european',
	'hu' => 'east_european',
	'iw' => 'hebrew',
	'is' => 'west_european',
	'it' => 'west_european',
	'ja' => 'japanese',
	'kl' => 'west_european',
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european',
	'nl' => 'west_european',
	'no' => 'west_european',
	'pl' => 'east_european',
	'pt' => 'west_european',
	'ro' => 'east_european',
	'ru' => 'cyrillic',
	'sk' => 'east_european',
	'sl' => 'east_european',
	'sr' => 'cyrillic',
	'sv' => 'west_european',
	'sq' => 'albanian',
	'th' => 'thai',
	'uk' => 'cyrillic',
	'vi' => 'vietnamese',
	'zh' => 'chinese',

	// Three-letter codes
	// NOTE(review): these look like Windows locale abbreviations - confirm
	'ara' => 'arabic',
	'bgr' => 'cyrillic',
	'cat' => 'west_european',
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european',
	'dan' => 'west_european',
	'deu' => 'west_european',
	'dea' => 'west_european',
	'des' => 'west_european',
	'ena' => 'west_european',
	'enc' => 'west_european',
	'eng' => 'west_european',
	'enz' => 'west_european',
	'enu' => 'west_european',
	'euq' => 'west_european',
	'fos' => 'west_european',
	'far' => 'arabic',
	'fin' => 'west_european',
	'fra' => 'west_european',
	'frb' => 'west_european',
	'frc' => 'west_european',
	'frs' => 'west_european',
	'geo' => 'unicode',
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode',
	'hun' => 'east_european',
	'isl' => 'west_european',
	'ita' => 'west_european',
	'its' => 'west_european',
	'jpn' => 'japanese',
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european',
	'msl' => 'west_european',
	'nlb' => 'west_european',
	'nld' => 'west_european',
	'nor' => 'west_european',
	'non' => 'west_european',
	'plk' => 'east_european',
	'ptg' => 'west_european',
	'ptb' => 'west_european',
	'rom' => 'east_european',
	'rus' => 'cyrillic',
	'slv' => 'east_european',
	'sky' => 'east_european',
	'srl' => 'east_european',
	'srb' => 'cyrillic',
	'esp' => 'west_european',
	'esm' => 'west_european',
	'esn' => 'west_european',
	'sve' => 'west_european',
	'sqi' => 'albanian',
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic',

	// English language names
	'albanian'    => 'albanian',
	'arabic'      => 'arabic',
	'basque'      => 'west_european',
	'bosnian'     => 'east_european',
	'bulgarian'   => 'cyrillic',
	'catalan'     => 'west_european',
	'croatian'    => 'east_european',
	'czech'       => 'east_european',
	'danish'      => 'west_european',
	'dutch'       => 'west_european',
	'english'     => 'west_european',
	'esperanto'   => 'unicode',
	'estonian'    => 'estonian',
	'faroese'     => 'west_european',
	'farsi'       => 'arabic',
	'finnish'     => 'west_european',
	'french'      => 'west_european',
	'galician'    => 'west_european',
	'georgian'    => 'unicode',
	'german'      => 'west_european',
	'greek'       => 'greek',
	'greenlandic' => 'west_european',
	'hebrew'      => 'hebrew',
	'hindi'       => 'unicode',
	'hungarian'   => 'east_european',
	'icelandic'   => 'west_european',
	'italian'     => 'west_european',
	'latvian'     => 'west_european',
	'lettish'     => 'west_european',
	'lithuanian'  => 'lithuanian',
	'malay'       => 'west_european',
	'norwegian'   => 'west_european',
	'persian'     => 'arabic',
	'polish'      => 'east_european',
	'portuguese'  => 'west_european',
	'russian'     => 'cyrillic',
	'romanian'    => 'east_european',
	'serbian'     => 'cyrillic',
	'slovak'      => 'east_european',
	'slovenian'   => 'east_european',
	'spanish'     => 'west_european',
	'svedish'     => 'west_european',	// misspelled legacy key, kept for compatibility
	'swedish'     => 'west_european',
	'that'        => 'thai',		// misspelled legacy key, kept for compatibility
	'thai'        => 'thai',
	'turkish'     => 'turkish',
	'ukrainian'   => 'cyrillic',
);
00402
00403
/**
 * Script family => charset, used by get_locale_charset() whenever TYPO3_OS
 * is not 'WIN'. A family with an empty value ('vietnamese') makes the caller
 * fall back to its default of 'iso-8859-1'.
 */
var $script_to_charset_unix = array(
	'west_european' => 'iso-8859-1',
	'estonian'      => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic'        => 'iso-8859-4',
	'cyrillic'      => 'iso-8859-5',
	'arabic'        => 'iso-8859-6',
	'greek'         => 'iso-8859-7',
	'hebrew'        => 'iso-8859-8',
	'turkish'       => 'iso-8859-9',
	'thai'          => 'iso-8859-11',
	'lithuanian'    => 'iso-8859-13',
	'chinese'       => 'gb2312',
	'japanese'      => 'euc-jp',
	'korean'        => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese'  => 'big5',
	'vietnamese'    => '',
	'unicode'       => 'utf-8',
	'albanian'      => 'utf-8',
);
00425
00426
/**
 * Script family => charset, used by get_locale_charset() when TYPO3_OS is
 * 'WIN'. The values are returned as they stand here; get_locale_charset()
 * does not pass them through parse_charset().
 */
var $script_to_charset_windows = array(
	'east_european' => 'windows-1250',
	'cyrillic'      => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek'         => 'windows-1253',
	'turkish'       => 'windows-1254',
	'hebrew'        => 'windows-1255',
	'arabic'        => 'windows-1256',
	'baltic'        => 'windows-1257',
	'estonian'      => 'windows-1257',
	'lithuanian'    => 'windows-1257',
	'vietnamese'    => 'windows-1258',
	'thai'          => 'cp874',
	'korean'        => 'cp949',
	'chinese'       => 'gb2312',
	'japanese'      => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese'  => 'big5',
	'albanian'      => 'windows-1250',
	'unicode'       => 'utf-8',
);
00448
00449
/**
 * Complete locale strings => charset. get_locale_charset() consults this
 * table before it starts taking the locale apart.
 *
 * get_locale_charset() lower-cases the locale before the lookup, so keys have
 * to be lower case to be found. The Serbian (Latin) entry was only present as
 * 'sr@Latn' and therefore never matched; a lower-case twin has been added and
 * the mixed-case key is kept for code that reads this table directly.
 */
var $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis'   => 'euc-jp',
	'korean.euc'   => 'euc-kr',
	'sr@Latn'      => 'iso-8859-2',
	'sr@latn'      => 'iso-8859-2',
	'zh_cn'        => 'gb2312',
	'zh_hk'        => 'big5',
	'zh_tw'        => 'big5',
);
00459
00460
00461
/**
 * Language key => charset.
 * NOTE(review): the keys look like TYPO3 backend language keys and an empty
 * value presumably stands for the default charset - confirm against the
 * code that reads this table (none of it is part of this section).
 */
var $charSetArray = array(
	'dk' => '',
	'de' => '',
	'no' => '',
	'it' => '',
	'fr' => '',
	'es' => '',
	'nl' => '',
	'cz' => 'windows-1250',
	'pl' => 'iso-8859-2',
	'si' => 'windows-1250',
	'fi' => '',
	'tr' => 'iso-8859-9',
	'se' => '',
	'pt' => '',
	'ru' => 'windows-1251',
	'ro' => 'iso-8859-2',
	'ch' => 'gb2312',
	'sk' => 'windows-1250',
	'lt' => 'windows-1257',
	'is' => 'utf-8',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'gl' => '',
	'th' => 'iso-8859-11',
	'gr' => 'iso-8859-7',
	'hk' => 'big5',
	'eu' => '',
	'bg' => 'windows-1251',
	'br' => '',
	'et' => 'iso-8859-4',
	'ar' => 'iso-8859-6',
	'he' => 'utf-8',
	'ua' => 'windows-1251',
	'jp' => 'shift_jis',
	'lv' => 'utf-8',
	'vn' => 'utf-8',
	'ca' => 'iso-8859-15',
	'ba' => 'iso-8859-2',
	'kr' => 'euc-kr',
	'eo' => 'utf-8',
	'my' => '',
	'hi' => 'utf-8',
	'fo' => 'utf-8',
	'fa' => 'utf-8',
	'sr' => 'utf-8',
	'sq' => 'utf-8',
	'ge' => 'utf-8',
);
00511
00512
00513
/**
 * Language key => language/locale code, for the keys whose code differs
 * from the key itself.
 * NOTE(review): the keys match those of $charSetArray and the values look
 * like ISO 639 language codes (partly with a country suffix) - confirm.
 */
var $isoArray = array(
	'ba' => 'bs',
	'br' => 'pt_BR',
	'ch' => 'zh_CN',
	'cz' => 'cs',
	'dk' => 'da',
	'si' => 'sl',
	'se' => 'sv',
	'gl' => 'kl',
	'gr' => 'el',
	'hk' => 'zh_HK',
	'kr' => 'ko',
	'ua' => 'uk',
	'jp' => 'ja',
	'vn' => 'vi',
);
00530
/**
 * Normalizes a charset name: lower-cases it, strips surrounding whitespace
 * and replaces a known alias from $this->synonyms by its canonical name.
 *
 * @param	string		$charset	Charset name as supplied by the caller
 * @return	string		Normalized charset name
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized])
		? $this->synonyms[$normalized]
		: $normalized;
}
00544
/**
 * Determines the charset that belongs to a locale string such as
 * "de_DE.ISO-8859-15@euro".
 *
 * Resolution order:
 *  1. the complete locale is looked up in $this->locale_to_charset
 *  2. an explicit ".charset" part wins (normalized by parse_charset())
 *  3. the "@euro" modifier selects iso-8859-15
 *  4. the language part is mapped to a script family and from there to the
 *     charset of the current platform (TYPO3_OS == 'WIN' or anything else)
 *  5. the platform default: windows-1252 on Windows, iso-8859-1 otherwise
 *
 * @param	string		$locale	Locale string, "language[_country][.charset][@modifier]"
 * @return	string		Charset name
 */
function get_locale_charset($locale) {
		// The table may contain mixed-case keys (e.g. 'sr@Latn'), so the
		// locale is tried exactly as given before it is lower-cased.
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

	$locale = strtolower($locale);
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

		// Split off the modifier ("@euro"); the part may be absent.
	$parts = explode('@', $locale);
	$locale = $parts[0];
	$modifier = isset($parts[1]) ? $parts[1] : '';

		// Split off an explicit charset ("de_DE.utf8"); the part may be absent.
	$parts = explode('.', $locale);
	$locale = $parts[0];
	$charset = isset($parts[1]) ? $parts[1] : '';
	if ($charset) {
		return $this->parse_charset($charset);
	}

	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

		// Only the language part is needed from here on; the country is ignored.
	$parts = explode('_', $locale);
	$language = $parts[0];
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	if (TYPO3_OS == 'WIN') {
		$scriptToCharset = $this->script_to_charset_windows;
			// This default used to be the misspelled, invalid name 'window-1252'.
		$default = 'windows-1252';
	} else {
		$scriptToCharset = $this->script_to_charset_unix;
		$default = 'iso-8859-1';
	}

		// Unknown families and families with an empty charset use the default.
	return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $default;
}
00585
00586
00587
00588
00589
00590
00591
00592
00593
00594
00595
00596
00597
00598
00599
/**
 * Converts a string from one charset to another.
 *
 * When the target is UTF-8, or when unconvertible characters do not have to
 * become numeric entities, the conversion is first attempted with the PHP
 * extension named in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
 * ('mbstring', 'iconv' or 'recode'). If no extension is configured or the
 * extension reports failure, the string is converted through UTF-8 with this
 * class' own tables.
 *
 * @param	string		$str	String to convert
 * @param	string		$fromCS	Source charset (canonical name, see parse_charset())
 * @param	string		$toCS	Target charset (canonical name, see parse_charset())
 * @param	boolean		$useEntityForNoChar	If set, characters missing in the target charset become numeric entities instead of the "no character" byte
 * @return	string		Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
	if ($fromCS==$toCS) {
		return $str;
	}

		// The extensions cannot emit entities for unconvertible characters,
		// so they are only used when that is not requested (or cannot occur
		// because the target is UTF-8).
	if ($toCS=='utf-8' || !$useEntityForNoChar) {
		$method = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
			? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
			: '';

		switch ($method) {
			case 'mbstring':
				$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
				if (false !== $conv_str) return $conv_str;
			break;

			case 'iconv':
					// "//TRANSLIT" asks iconv to approximate characters that
					// the target charset lacks. This line was truncated
					// (unterminated string) in the previous revision.
				$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
				if (false !== $conv_str) return $conv_str;
			break;

			case 'recode':
				$conv_str = recode_string($fromCS.'..'.$toCS,$str);
				if (false !== $conv_str) return $conv_str;
			break;
		}
	}

		// Table based fallback: source charset -> UTF-8 -> target charset.
	if ($fromCS!='utf-8') $str = $this->utf8_encode($str,$fromCS);
	if ($toCS!='utf-8') $str = $this->utf8_decode($str,$toCS,$useEntityForNoChar);

	return $str;
}
00638
/**
 * Converts every string in a (possibly nested) array from one charset to
 * another. The array is passed by reference and modified in place; nested
 * arrays are handled recursively, all other values go through conv().
 *
 * @param	array		$array	Array to convert (by reference)
 * @param	string		$fromCS	Source charset
 * @param	string		$toCS	Target charset
 * @param	boolean		$useEntityForNoChar	Passed on to conv()
 * @return	void
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
	foreach ($array as $index => $unused) {
		if (!is_array($array[$index])) {
			$array[$index] = $this->conv($array[$index],$fromCS,$toCS,$useEntityForNoChar);
			continue;
		}

		$this->convArray($array[$index],$fromCS,$toCS,$useEntityForNoChar);
	}
}
00659
/**
 * Converts a string from the given charset to UTF-8 with the table loaded by
 * initCharset().
 *
 * Bytes up to 127 are copied unchanged, except for the charsets listed in
 * $this->twoByteSets, where every two bytes form one table code. Codes
 * without a table entry become the "no character" byte.
 *
 * @param	string		$str	String in the charset $charset
 * @param	string		$charset	Source charset (canonical name)
 * @return	string		UTF-8 string; NULL if no conversion table is available for $charset
 */
function utf8_encode($str,$charset) {
	if ($charset === 'utf-8') {
		return $str;
	}

	if (!$this->initCharset($charset)) {
			// Callers have always received NULL here; it is now explicit.
		return NULL;
	}

	$isTwoByteSet = isset($this->twoByteSets[$charset]);
	$isEucBased = isset($this->eucBasedSets[$charset]);
	$noChar = chr($this->noCharByteVal);
	$strLen = strlen($str);
	$outStr = '';

	for ($a=0; $a<$strLen; $a++) {
		$chr = substr($str,$a,1);
		$ord = ord($chr);

		if ($isTwoByteSet) {
				// Fixed width: combine this byte with the following one.
				// substr() is used instead of the removed $str{...} offset
				// syntax and yields 0 for a missing trailing byte.
			$a++;
			$ord = $ord<<8 | ord(substr($str,$a,1));

			$outStr .= isset($this->parsedCharsets[$charset]['local'][$ord])
				? $this->parsedCharsets[$charset]['local'][$ord]
				: $noChar;
		} elseif ($ord>127) {
				// In EUC style sets a high byte starts a two-byte character.
				// Shift_JIS bytes 0xA0-0xDF are the exception and stay single.
			if ($isEucBased && ($charset != 'shift_jis' || $ord < 0xA0 || $ord > 0xDF)) {
				$a++;
				$ord = $ord*256 + ord(substr($str,$a,1));
			}

			$outStr .= isset($this->parsedCharsets[$charset]['local'][$ord])
				? $this->parsedCharsets[$charset]['local'][$ord]
				: $noChar;
		} else {
			$outStr .= $chr;
		}
	}

	return $outStr;
}
00704
/**
 * Converts a UTF-8 string to the given charset with the table loaded by
 * initCharset().
 *
 * Bytes up to 127 are copied unchanged. A multi-byte sequence is replaced by
 * its table code (written as two bytes when the code exceeds 255). Sequences
 * without a table entry become a hexadecimal numeric entity when
 * $useEntityForNoChar is set, otherwise the "no character" byte. A stray
 * continuation byte always becomes the "no character" byte.
 *
 * @param	string		$str	UTF-8 string
 * @param	string		$charset	Target charset (canonical name)
 * @param	boolean		$useEntityForNoChar	If set, unconvertible characters become numeric entities
 * @return	string		String in the target charset; NULL if no conversion table is available for $charset
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {
	if (!$this->initCharset($charset)) {
		return;
	}

	$result = '';
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
				// Plain ASCII
			$result .= $byte;
		} elseif (!($code & 64)) {
				// Continuation byte without a preceding lead byte
			$result .= chr($this->noCharByteVal);
		} else {
				// Lead byte: every further leading 1-bit announces one more byte.
			$sequence = $byte;
			for ($n=0; $n<8; $n++) {
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$pos++;
				$sequence .= substr($str,$pos,1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
				$localCode = $this->parsedCharsets[$charset]['utf8'][$sequence];
				if ($localCode > 255) {
					$result .= chr(($localCode >> 8) & 255).chr($localCode & 255);
				} else {
					$result .= chr($localCode);
				}
			} elseif ($useEntityForNoChar) {
				$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
			} else {
				$result .= chr($this->noCharByteVal);
			}
		}

		$pos++;
	}

	return $result;
}
00749
/**
 * Replaces every multi-byte UTF-8 character by a hexadecimal numeric entity
 * ("&#x...;"). Bytes up to 127 are copied unchanged; a stray continuation
 * byte becomes the "no character" byte.
 *
 * @param	string		$str	UTF-8 string
 * @return	string		String in which all characters above 127 are entities
 */
function utf8_to_entities($str) {
	$result = '';
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
			$result .= $byte;
		} elseif (!($code & 64)) {
				// Continuation byte without a preceding lead byte
			$result .= chr($this->noCharByteVal);
		} else {
				// Collect the bytes announced by the leading 1-bits of the lead byte.
			$sequence = $byte;
			for ($n=0; $n<8; $n++) {
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$pos++;
				$sequence .= substr($str,$pos,1);
			}

			$result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
		}

		$pos++;
	}

	return $result;
}
00781
/**
 * Replaces numeric entities ("&#233;", "&#xE9;") and, optionally, named HTML
 * entities ("&eacute;") by their UTF-8 characters. Everything that is not
 * recognised as an entity is left untouched.
 *
 * Changes compared to the earlier implementation:
 *  - the POSIX regex function ereg_replace() (removed in PHP 7) and the
 *    random split token are replaced by a single preg_split()
 *  - the entity table is requested in ISO-8859-1 where PHP supports that,
 *    because its values are passed to utf8_encode(..., 'iso-8859-1'); from
 *    PHP 5.4 on the default table is UTF-8, which double-encoded characters
 *  - "&#X41;" (upper-case X) is accepted as a hexadecimal entity
 *
 * @param	string		$str	Input string, UTF-8 or pure ASCII
 * @param	boolean		$alsoStdHtmlEnt	If set, named HTML entities are converted as well
 * @return	string		UTF-8 string
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
			// The third argument exists since PHP 5.3.4. Argument counts of
			// internal functions are checked at run time only, so the branch
			// that is not taken does no harm on older versions.
		$table = version_compare(PHP_VERSION, '5.3.4', '>=')
			? get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1')
			: get_html_translation_table(HTML_ENTITIES);
		$trans_tbl = array_flip($table);
	}

		// Even indexes hold the text between entities, odd indexes the entity
		// body (the part between "&" and ";", possibly empty).
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === false) {
		return $str;
	}

	foreach ($parts as $k => $v) {
		if (!($k % 2)) {
			continue;
		}

		if (substr($v,0,1) == '#') {
			if (strtolower(substr($v,1,1)) == 'x') {
				$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
			} else {
				$parts[$k] = $this->UnumberToChar(intval(substr($v,1)));
			}
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {
			$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
		} else {
				// Unknown entity: restore it exactly as it was.
			$parts[$k] = '&'.$v.';';
		}
	}

	return implode('',$parts);
}
00814
/**
 * Splits a UTF-8 string into an array with one element per character.
 *
 * @param	string		$str	UTF-8 string
 * @param	boolean		$convEntities	If set, entities (numeric and named) are converted to characters first
 * @param	boolean		$retChar	If set, the elements are the characters themselves, otherwise their Unicode numbers
 * @return	array		Characters or Unicode numbers; a stray continuation byte yields the "no character" value
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
	if ($convEntities) {
		$str = $this->entities_to_utf8($str,1);
	}

	$result = array();
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total) {
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127) {
			$result[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64)) {
				// Continuation byte without a preceding lead byte
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Collect the bytes announced by the leading 1-bits of the lead byte.
			$sequence = $byte;
			for ($n=0; $n<8; $n++) {
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$pos++;
				$sequence .= substr($str,$pos,1);
			}

			$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}

		$pos++;
	}

	return $result;
}
00853
/**
 * Encodes a Unicode number as a UTF-8 byte sequence of one to six bytes.
 * Numbers of 0x80000000 and above yield the "no character" byte.
 *
 * @param	integer		$cbyte	Unicode number
 * @return	string		UTF-8 encoded character
 */
function UnumberToChar($cbyte) {
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

		// Sequence length => array(exclusive upper bound, marker bits of the lead byte)
	$layouts = array(
		2 => array(0x800,      0xC0),
		3 => array(0x10000,    0xE0),
		4 => array(0x200000,   0xF0),
		5 => array(0x4000000,  0xF8),
		6 => array(0x80000000, 0xFC),
	);

	foreach ($layouts as $length => $layout) {
		if ($cbyte < $layout[0]) {
				// The lead byte carries the topmost bits ...
			$char = chr($layout[1] | ($cbyte >> (6 * ($length - 1))));
				// ... each continuation byte the next six, most significant first.
			for ($shift = 6 * ($length - 2); $shift >= 0; $shift -= 6) {
				$char .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
			}
			return $char;
		}
	}

	return chr($this->noCharByteVal);
}
00908
/**
 * Decodes a single UTF-8 character to its Unicode number. Only the first
 * character of $str is looked at. A first byte that is not a multi-byte lead
 * byte is returned as its own byte value.
 *
 * @param	string		$str	UTF-8 character
 * @param	boolean		$hex	If set, the number is returned as "x" followed by lower-case hex digits
 * @return	mixed		Unicode number, or the "x..." string if $hex is set
 */
function utf8CharToUnumber($str,$hex=0) {
	$lead = ord(substr($str,0,1));

	if (($lead & 192) != 192) {
		$int = $lead;
	} else {
			// One continuation byte per leading 1-bit after the first;
			// each of them contributes its low six bits.
		$shifted = $lead;
		$payloadBits = '';
		$extraBytes = 0;
		while ($extraBytes < 8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				break;
			}
			$payloadBits .= sprintf('%06b', ord(substr($str,$extraBytes+1,1)) & 63);
			$extraBytes++;
		}

			// The lead byte contributes the bits left over after its length marker.
		$leadBits = substr('00000000'.decbin($lead), -(6-$extraBytes));

		$int = bindec($leadBits.$payloadBits);
	}

	if ($hex) {
		return 'x'.dechex($int);
	}

	return $int;
}
00936
00937
00938
00939
00940
00941
00942
00943
00944
00945
00946
00947
00948
00949
00950
/**
 * Makes sure the conversion table of a charset is available in
 * $this->parsedCharsets[$charset].
 *
 * The table is taken from memory, from the cache file
 * typo3temp/cs/charset_[charset].tbl, or parsed from
 * PATH_t3lib/csconvtbl/[charset].tbl (and then written to the cache file).
 * Two line formats are understood; the first data line decides which one
 * applies to the whole file:
 *  - "whitespaced": 0x<local code> 0x<unicode> followed by whitespace
 *  - "ms-token":    <local code>=U+<unicode>:<description>
 * Only local codes above 127 are stored.
 *
 * The removed POSIX regex functions ereg() and split() have been replaced by
 * preg_match() and preg_split().
 *
 * @param	string		$charset	Charset name (canonical, lower case)
 * @return	mixed		1 = table was already in memory, 2 = table was loaded from cache or parsed, FALSE = no table file for this charset
 */
function initCharset($charset) {
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
		return false;
	}

		// Prefer the already parsed, serialized table.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';

	foreach ($lines as $value) {
			// Skip empty lines and comments.
		if (!trim($value) || substr($value,0,1) == '#') {
			continue;
		}

		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		$hexbyte = '';
		$utf8 = '';
		if ($detectedType == 'ms-token') {
			$fields = preg_split('/=|:/',$value,3);
			$hexbyte = $fields[0];
			$utf8 = isset($fields[1]) ? $fields[1] : '';
		} elseif ($detectedType == 'whitespaced') {
			$regA = array();
			if (preg_match($whitespacedPattern,$value,$regA)) {
				$hexbyte = $regA[1];
				$utf8 = 'U+'.$regA[2];
			}
		}

			// A leading "0x" is removed explicitly; hexdec() merely skipped
			// the "x" and complains about such characters in newer PHP versions.
		$decval = hexdec(preg_replace('/^0x/i','',trim($hexbyte)));
		if ($decval > 127) {
				// substr(...,2) strips the "U+" prefix.
			$utf8decval = hexdec(substr(trim($utf8),2));
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
01013
/**
 * Builds the UTF-8 case folding table ($this->caseFolding['utf-8']) and the
 * UTF-8 to-ASCII transliteration table ($this->toASCII['utf-8']).
 *
 * With $mode 'case' or 'ascii' the requested table is first looked for in
 * memory and in its cache file (typo3temp/cs/cscase_utf-8.tbl or
 * typo3temp/cs/csascii_utf-8.tbl). Otherwise both tables are rebuilt from
 * PATH_t3lib/unidata/UnicodeData.txt, refined with SpecialCasing.txt and
 * Translit.txt from the same directory if those exist, and both cache files
 * are written. Only code points up to U+FFFF are processed.
 *
 * The removed functions split() and ereg() and the removed $string{n} offset
 * syntax have been replaced by explode(), preg_match() and substr(). Code
 * point strings are compared strictly, because a loose comparison treats
 * strings such as "0E00" and "0E01" as equal numbers.
 *
 * @param	string		$mode	'case' or 'ascii' to accept an in-memory or cached table; NULL to always rebuild
 * @return	mixed		1 = table was in memory, 2 = table was loaded from cache, 3 = tables were rebuilt, FALSE = UnicodeData.txt is not readable
 */
function initUnicodeData($mode=null) {
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Shortcut: requested table is in memory or cached.
	switch ($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
				return 2;
			}
		break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
				return 2;
			}
		break;
	}

		// ---- 1. UnicodeData.txt: simple case mappings, decompositions, marks, numbers
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh) return false;

	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8'];
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// "U+xxxx" => array of hex code points
	$mark = array();		// "U+xxxx" => 1 for characters of a general category starting with "M"
	$number = array();		// "U+xxxx" => numeric value
	$omit = array();		// "U+xxxx" => 1 for characters transliterated to nothing

	while (!feof($fh)) {
		$line = fgets($fh,4096);
		if ($line === false || rtrim($line) === '') {
				// Nothing to parse (typically the read attempt at end of file).
			continue;
		}

			// Fields used: 0 code point, 1 name, 2 general category, 5 decomposition,
			// 8 numeric value, 12/13/14 simple upper/lower/title case mapping.
		$fields = array_pad(explode(';',rtrim($line)),15,'');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) break;	// only U+0000 - U+FFFF is handled

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// A title case mapping is stored only where it differs from upper case.
		if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1)) {
			case 'M':
				$mark["U+$char"] = 1;
			break;

			case 'N':
				if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
			break;
		}

			// Latin letters "WITH something" that have no decomposition of
			// their own are mapped to their base letter.
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') $c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
			switch ($match[1]) {
				case '<circle>':	// wrap in parentheses
					$match[2] = '0028 '.$match[2].' 0029';
				break;

				case '<square>':	// wrap in square brackets
					$match[2] = '005B '.$match[2].' 005D';
				break;

				case '<compat>':	// ignore compatibility forms that start with a space
					if (preg_match('/^0020 /',$match[2])) continue 2;
				break;

					// positional and vertical forms are ignored
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// ---- 2. SpecialCasing.txt: unconditional multi-character case mappings
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = (string)fgets($fh,4096);
				if (substr($line,0,1) == '#' || trim($line) == '') {
					continue;
				}

				$fields = array_pad(t3lib_div::trimExplode(';',$line),5,'');
				$char = $fields[0];
				$cond = $fields[4];

					// Only mappings without a condition are used; the fifth
					// field is then empty or already the trailing comment.
				if (!($cond == '' || substr($cond,0,1) == '#')) {
					continue;
				}

				$utf8_char = $this->UnumberToChar(hexdec($char));
				$mappings = array(
					'toLower' => $fields[1],
					'toTitle' => $fields[2],
					'toUpper' => $fields[3],
				);
				foreach ($mappings as $table => $codes) {
					if ($char === $codes) continue;
						// Title case is only stored where it differs from upper case.
					if ($table == 'toTitle' && $codes === $mappings['toUpper']) continue;

					$arr = explode(' ',$codes);
					for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
					$utf8CaseFolding[$table][$utf8_char] = implode('',$arr);
				}
			}
			fclose($fh);
		}
	}

		// ---- 3. Translit.txt: custom transliteration rules, overriding the above
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = (string)fgets($fh,4096);
				if (substr($line,0,1) == '#' || trim($line) == '') {
					continue;
				}

				$fields = array_pad(t3lib_div::trimExplode(';',$line),2,'');
				$char = $fields[0];
				$translit = $fields[1];

				if (!$translit) $omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ',$translit);
			}
			fclose($fh);
		}
	}

		// ---- 4. Resolve decompositions recursively and drop combining marks
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) {
					// Replace the code by its own decomposition, keeping the order.
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to,$cv);
				}
			} elseif (!isset($mark["U+$code_value"])) {
				array_push($code_decomp,$code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// ---- 5. Keep the decompositions that consist of ASCII characters only
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				continue 2;	// not representable in ASCII, skip this character
			}
			array_push($code_decomp,chr($ord));
		}
		$ascii[$this->UnumberToChar(hexdec($from))] = implode('',$code_decomp);
	}

		// ---- 6. Numeric characters without a transliteration get their numeric value
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec($from));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
01239
/**
 * Makes sure the case folding table of a charset is available in
 * $this->caseFolding[$charset].
 *
 * The table is taken from memory, from the cache file
 * typo3temp/cs/cscase_[charset].tbl, or derived from the charset's
 * conversion table and the UTF-8 case folding table. Mappings whose result
 * cannot be expressed in the charset are left out. The ASCII letters a-z and
 * A-Z are always added.
 *
 * Characters without a case mapping are now skipped with isset() instead of
 * reading an undefined index and passing NULL to utf8_decode().
 *
 * @param	string		$charset	Charset name (canonical, lower case)
 * @return	mixed		1 = table was in memory, 2 = table was loaded from cache, 3 = table was built, FALSE = charset or Unicode data not available
 */
function initCaseFolding($charset) {
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
		return 1;
	}

	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset's conversion table and the UTF-8 table are needed.
	if (!$this->initCharset($charset)) {
		return false;
	}
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$directions = array('toUpper','toLower','toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
			// The character as it appears in the local charset
		$c = $this->utf8_decode($utf8,$charset);

		foreach ($directions as $direction) {
			if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
				continue;
			}

			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8],$charset);
				// Keep the mapping only if its target exists in this charset.
			if ($cc != '' && $cc != $nochar) {
				$this->caseFolding[$charset][$direction][$c] = $cc;
			}
		}
	}

		// ASCII letters are not part of the conversion tables.
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
01301
/**
 * Makes sure the to-ASCII transliteration table of a charset is available in
 * $this->toASCII[$charset].
 *
 * The table is taken from memory, from the cache file
 * typo3temp/cs/csascii_[charset].tbl, or derived from the charset's
 * conversion table and the UTF-8 transliteration table.
 *
 * The table is now always an array, even if no character of the charset has
 * a transliteration. It used to stay undefined in that case, so NULL was
 * written to the cache file.
 *
 * @param	string		$charset	Charset name (canonical, lower case)
 * @return	mixed		1 = table was in memory, 2 = table was loaded from cache, 3 = table was built, FALSE = charset or Unicode data not available
 */
function initToASCII($charset) {
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
		return 1;
	}

	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset's conversion table and the UTF-8 table are needed.
	if (!$this->initCharset($charset)) {
		return false;
	}
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

		// Never overwrite a table that is already there.
	if (!isset($this->toASCII[$charset]) || !is_array($this->toASCII[$charset])) {
		$this->toASCII[$charset] = array();
	}

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (!isset($this->toASCII['utf-8'][$utf8])) {
			continue;
		}

			// Key: the character as it appears in the local charset
		$c = $this->utf8_decode($utf8,$charset);
		$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
01347
01348
01349
01350
01351
01352
01353
01354
01355
01356
01357
01358
01359
01360
01361
01362
01363
01364
01365
01366
01367
01368
/**
 * Charset-aware substr(): $start and $len count characters, not bytes.
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work
 * is done by mbstring or iconv. Otherwise UTF-8 and the EUC based charsets
 * are handled by this class, two- and four-byte charsets by scaling the
 * offsets, and every other charset is treated as single-byte.
 *
 * Changes compared to the earlier implementation:
 *  - two- and four-byte charsets returned an empty string when $len was
 *    omitted, because NULL * 2 was passed as a length of 0
 *  - "up to the end" is expressed by passing the character length of the
 *    string to mbstring/iconv, instead of temporarily switching their
 *    internal encoding (iconv_set_encoding() is deprecated)
 *  - the charset family tables are tested without undefined-index notices
 *
 * @param	string		$charset	Charset of $string (canonical name)
 * @param	string		$string	String to cut
 * @param	integer		$start	Start position in characters (negative counts from the end)
 * @param	integer		$len	Length in characters; NULL means up to the end of the string
 * @return	string		The substring
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) {
		return '';
	}

	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
		? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
		: '';

	if ($utils == 'mbstring') {
		if ($len==null) {
				// A length equal to the whole string always reaches the end.
			$len = mb_strlen($string,$charset);
		}
		return mb_substr($string,$start,$len,$charset);
	}

	if ($utils == 'iconv') {
		if ($len==null) {
			$len = iconv_strlen($string,$charset);
		}
		return iconv_substr($string,$start,$len,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	}

	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// Fixed-width charsets: scale the character offsets to byte offsets.
	$bytesPerChar = 1;
	if (!empty($this->twoByteSets[$charset])) {
		$bytesPerChar = 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		$bytesPerChar = 4;
	}

	if ($len === NULL) {
		return substr($string,$start*$bytesPerChar);
	}

	return substr($string,$start*$bytesPerChar,$len*$bytesPerChar);
}
01423
/**
 * Charset-aware strlen(): returns the number of characters, not bytes.
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work
 * is done by mbstring or iconv. Otherwise UTF-8 and the EUC based charsets
 * are handled by this class, two- and four-byte charsets by dividing the
 * byte length, and every other charset is treated as single-byte.
 *
 * The configuration value and the charset family tables are now tested
 * without raising undefined-index notices for charsets that are not listed.
 *
 * @param	string		$charset	Charset of $string (canonical name)
 * @param	string		$string	String to measure
 * @return	integer		Number of characters (a fraction results if the byte length of a two- or four-byte string is not a multiple of the character width)
 */
function strlen($charset,$string) {
	$utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
		? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
		: '';

	if ($utils == 'mbstring') {
		return mb_strlen($string,$charset);
	}
	if ($utils == 'iconv') {
		return iconv_strlen($string,$charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string,$charset);
	}
	if (!empty($this->twoByteSets[$charset])) {
		return strlen($string)/2;
	}
	if (!empty($this->fourByteSets[$charset])) {
		return strlen($string)/4;
	}

		// Single-byte charset
	return strlen($string);
}
01451
01464 function crop($charset,$string,$len,$crop='') {
01465 if (intval($len) == 0) return $string;
01466
01467 if ($charset == 'utf-8') {
01468 $i = $this->utf8_char2byte_pos($string,$len);
01469 } elseif ($this->eucBasedSets[$charset]) {
01470 $i = $this->euc_char2byte_pos($string,$len,$charset);
01471 } else {
01472 if ($len > 0) {
01473 $i = $len;
01474 } else {
01475 $i = strlen($string)+$len;
01476 if ($i<=0) $i = false;
01477 }
01478 }
01479
01480 if ($i === false) {
01481 return $string;
01482 } else {
01483