andi_b: japanischen sonderzeichen in iso-2022-jp umwandeln

Beitrag lesen

alter!
das war ein drama...
zunächst ging ja das iconv modul nicht (https://forum.selfhtml.org/?t=87524&m=520442)

nachdem das endlich gelöst war, kam ein php-bug:
cannot yet handle MBCS in html_entity_decode()
http://bugs.php.net/bug.php?id=25670

die lösung dort funktioniert aber leider nicht. zu guter letzt habe ich im php-manual einen kommentar mit der passenden funktion gefunden (drittletzter, ronen at greyzone dot com):
http://de3.php.net/manual/de/function.utf8-encode.php

das ganze ist jetzt so gedacht, dass ich meine japanische datei aus word als htm speichere, dann wird via script der ganze <span>-mist entfernt, dann wird mit folgender klasse das dokument gewandelt. nun muss man nur noch den header auf:
<html lang="ja">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp">
<meta http-equiv="content-language" content="ja">

setzen. dann gehts :)

viele grüsse, andi

ps: ich weiss, dass die funktion sehr unvorteilhaft eingebunden ist, ist mir aber momentan nicht so wichtig ;)

/**
 * Rewrites decimal numeric HTML entities (e.g. "&#26085;") found in a file
 * as ISO-2022-JP encoded text.
 *
 * Each entity is first decoded to UTF-8 by utf8Encode() and then converted
 * with iconv(). Only entities with four or five digits are touched by
 * processFile(); everything else in the file is left as it is.
 */
class ConvertToJapaneseUTF
{
  /**
   * Writer used to store the converted content. processFile() calls its
   * write() method with (file name, complete new content).
   * NOTE(review): FileWrite is defined outside this file - confirm that
   * write() overwrites the target.
   *
   * @var FileWrite
   */
  public $writer;

  /**
   * Copy of the global $directoryDelimiter taken in the constructor.
   * Not read anywhere inside this class; the (misspelt) name is kept so
   * that external code reading the property keeps working.
   *
   * @var mixed
   */
  public $delimitter;

  /**
   * When true, processFile() prints neither the file name nor the
   * per-entity conversion trace.
   *
   * @var bool
   */
  public $silent = false;

  /**
   * Stores the global directory delimiter and creates the file writer.
   */
  public function __construct()
  {
    global $directoryDelimiter;
    $this->delimitter = $directoryDelimiter;
    $this->writer = new FileWrite();
  }

  /**
   * Converts all four- and five-digit decimal entities of one file to
   * ISO-2022-JP and hands the result to the writer if anything changed.
   *
   * Raises E_USER_ERROR when the file cannot be read. An empty file is
   * not an error; it simply contains nothing to convert.
   *
   * @param string $myFile Path of the file to process.
   * @return void
   */
  public function processFile($myFile)
  {
    $fileContent = file($myFile);
    // file() yields an empty array for an empty file, so only a strict
    // false signals a real read failure.
    if ($fileContent === false) {
      trigger_error('Error reading: '.$myFile, E_USER_ERROR);
      return;
    }

    $silent = $this->silent;
    if (!$silent) {
      echo $myFile."\r\n";
    }

    $pattern = '=&#[0-9]{4,5};=';

    // Built once and reused for every line.
    $convert = static function ($match) use ($silent) {
      $utf = ConvertToJapaneseUTF::utf8Encode($match[0]);
      $out = iconv('UTF-8', 'ISO-2022-JP', $utf);
      if ($out === false) {
        // Character has no ISO-2022-JP representation: keep the entity
        // instead of replacing it with an empty string.
        return $match[0];
      }
      if (!$silent) {
        echo $match[0].": $utf: $out\r\n";
      }
      return $out;
    };

    $fileProcessed = false;
    foreach ($fileContent as $index => $line) {
      $converted = preg_replace_callback($pattern, $convert, $line);
      // null means the regex engine failed; keep the original line then.
      if ($converted !== null && $converted !== $line) {
        $fileContent[$index] = $converted;
        $fileProcessed = true;
      }
    }

    if ($fileProcessed) {
      $this->writer->write($myFile, implode('', $fileContent));
    }
  }

  /**
   * Replaces every decimal numeric entity ("&#NNN;") in a string with the
   * UTF-8 encoding of that code point.
   *
   * Text outside of entities - including stray ';' characters - is
   * returned unchanged. Entities that are not plain decimal numbers, that
   * name a UTF-16 surrogate (U+D800..U+DFFF) or that lie beyond U+10FFFF
   * are left untouched because they have no valid UTF-8 encoding.
   *
   * @param string $source Text that may contain decimal entities.
   * @return string Text with the entities decoded to UTF-8.
   */
  public static function utf8Encode($source)
  {
    // At most seven significant digits are captured; anything longer is
    // necessarily above U+10FFFF and is deliberately not matched.
    $result = preg_replace_callback(
      '/&#0*([0-9]{1,7});/',
      static function ($match) {
        $encoded = self::codePointToUtf8((int) $match[1]);
        return $encoded === null ? $match[0] : $encoded;
      },
      $source
    );

    return $result === null ? $source : $result;
  }

  /**
   * Encodes a single Unicode code point as a UTF-8 byte sequence.
   *
   * @param int $codePoint Unicode scalar value.
   * @return string|null One to four bytes, or null if the value is a
   *                     surrogate or outside 0..0x10FFFF.
   */
  private static function codePointToUtf8($codePoint)
  {
    if ($codePoint < 0 || $codePoint > 0x10FFFF) {
      return null;
    }
    if ($codePoint >= 0xD800 && $codePoint <= 0xDFFF) {
      return null;
    }

    if ($codePoint < 0x80) {
      return chr($codePoint);
    }
    if ($codePoint < 0x800) {
      return chr(0xC0 | ($codePoint >> 6))
        . chr(0x80 | ($codePoint & 0x3F));
    }
    if ($codePoint < 0x10000) {
      return chr(0xE0 | ($codePoint >> 12))
        . chr(0x80 | (($codePoint >> 6) & 0x3F))
        . chr(0x80 | ($codePoint & 0x3F));
    }

    return chr(0xF0 | ($codePoint >> 18))
      . chr(0x80 | (($codePoint >> 12) & 0x3F))
      . chr(0x80 | (($codePoint >> 6) & 0x3F))
      . chr(0x80 | ($codePoint & 0x3F));
  }
}