From 34bc91a1553ad140bbf4c74761aeb948e738b44b Mon Sep 17 00:00:00 2001 From: Nicolas Boisselier Date: Sat, 2 Nov 2024 22:53:47 +0100 Subject: [PATCH] missing modules from IZI --- lib/perl/NB/Encoding/FixLatin.pm | 415 +++++++++++++++++++++ lib/perl/NB/Encoding/big5.enc | Bin 0 -> 40706 bytes lib/perl/NB/Encoding/euc-kr.enc | Bin 0 -> 45802 bytes lib/perl/NB/Encoding/iso-8859-15.enc | Bin 0 -> 1072 bytes lib/perl/NB/Encoding/iso-8859-2.enc | Bin 0 -> 1072 bytes lib/perl/NB/Encoding/iso-8859-3.enc | Bin 0 -> 1072 bytes lib/perl/NB/Encoding/iso-8859-4.enc | Bin 0 -> 1072 bytes lib/perl/NB/Encoding/iso-8859-5.enc | Bin 0 -> 1072 bytes lib/perl/NB/Encoding/iso-8859-7.enc | Bin 0 -> 1072 bytes lib/perl/NB/Encoding/iso-8859-8.enc | Bin 0 -> 1072 bytes lib/perl/NB/Encoding/iso-8859-9.enc | Bin 0 -> 1072 bytes lib/perl/NB/Encoding/windows-1250.enc | Bin 0 -> 1072 bytes lib/perl/NB/Encoding/windows-1252.enc | Bin 0 -> 1072 bytes lib/perl/NB/Encoding/x-euc-jp-jisx0221.enc | Bin 0 -> 37890 bytes lib/perl/NB/Encoding/x-euc-jp-unicode.enc | Bin 0 -> 37890 bytes lib/perl/NB/Encoding/x-sjis-cp932.enc | Bin 0 -> 20368 bytes lib/perl/NB/Encoding/x-sjis-jdk117.enc | Bin 0 -> 18202 bytes lib/perl/NB/Encoding/x-sjis-jisx0221.enc | Bin 0 -> 18202 bytes lib/perl/NB/Encoding/x-sjis-unicode.enc | Bin 0 -> 18202 bytes lib/perl/NB/Math/BaseCnv.pm | 401 ++++++++++++++++++++ 20 files changed, 816 insertions(+) create mode 100644 lib/perl/NB/Encoding/FixLatin.pm create mode 100644 lib/perl/NB/Encoding/big5.enc create mode 100644 lib/perl/NB/Encoding/euc-kr.enc create mode 100644 lib/perl/NB/Encoding/iso-8859-15.enc create mode 100644 lib/perl/NB/Encoding/iso-8859-2.enc create mode 100644 lib/perl/NB/Encoding/iso-8859-3.enc create mode 100644 lib/perl/NB/Encoding/iso-8859-4.enc create mode 100644 lib/perl/NB/Encoding/iso-8859-5.enc create mode 100644 lib/perl/NB/Encoding/iso-8859-7.enc create mode 100644 lib/perl/NB/Encoding/iso-8859-8.enc create mode 100644 lib/perl/NB/Encoding/iso-8859-9.enc 
create mode 100644 lib/perl/NB/Encoding/windows-1250.enc
create mode 100644 lib/perl/NB/Encoding/windows-1252.enc
create mode 100644 lib/perl/NB/Encoding/x-euc-jp-jisx0221.enc
create mode 100644 lib/perl/NB/Encoding/x-euc-jp-unicode.enc
create mode 100644 lib/perl/NB/Encoding/x-sjis-cp932.enc
create mode 100644 lib/perl/NB/Encoding/x-sjis-jdk117.enc
create mode 100644 lib/perl/NB/Encoding/x-sjis-jisx0221.enc
create mode 100644 lib/perl/NB/Encoding/x-sjis-unicode.enc
create mode 100644 lib/perl/NB/Math/BaseCnv.pm

diff --git a/lib/perl/NB/Encoding/FixLatin.pm b/lib/perl/NB/Encoding/FixLatin.pm
new file mode 100644
index 00000000..15f67944
--- /dev/null
+++ b/lib/perl/NB/Encoding/FixLatin.pm
@@ -0,0 +1,454 @@
+package NB::Encoding::FixLatin;
+
+use warnings;
+use strict;
+
+require 5.008;
+
+our $VERSION = '1.02';
+
+use Carp     qw(croak);
+use Exporter qw(import);
+use Encode   qw(is_utf8 encode_utf8);
+
+our @EXPORT_OK = qw(fix_latin);
+
+
+# Lazily built lookup table: each single byte 0x80-0xFF maps to the UTF-8 bytes
+# emitted for it.  Filled in by _init_byte_map() on the first fix_latin() call.
+my $byte_map;
+
+# The scanner patterns are anchored with \G so that fix_latin() can walk the
+# input in place via pos().  Capturing the unparsed remainder with (.*)\z
+# instead would copy it on every iteration, which is quadratic on big inputs.
+my $ascii_str = qr{\G([\x00-\x7F]+)};
+
+my $cont_byte = '[\x80-\xBF]';
+my $utf8_2 = qr{\G([\xC0-\xDF])($cont_byte)};
+my $utf8_3 = qr{\G([\xE0-\xEF])($cont_byte)($cont_byte)};
+my $utf8_4 = qr{\G([\xF0-\xF7])($cont_byte)($cont_byte)($cont_byte)};
+my $utf8_5 = qr{\G([\xF8-\xFB])($cont_byte)($cont_byte)($cont_byte)($cont_byte)};
+
+# Fallback pattern: consumes exactly one byte, whatever its value.
+my $any_byte = qr{\G(.)}s;
+
+my %known_opt = map { $_ => 1 } qw(bytes_only ascii_hex overlong_fatal);
+
+# The five bytes in the 0x80-0x9F range that have no CP1252 mapping, with the
+# ASCII text substituted for each when the ascii_hex option is true.
+my %non_1252 = (
+    "\x81" => '%81',
+    "\x8D" => '%8D',
+    "\x8F" => '%8F',
+    "\x90" => '%90',
+    "\x9D" => '%9D',
+);
+
+# Convert a byte string that may mix ASCII, UTF-8, Latin-1 and CP1252 into
+# UTF-8.
+#
+#   fix_latin($input, %options)
+#
+# Recognised options: ascii_hex (default 1), bytes_only (default 0) and
+# overlong_fatal (default 0).  Croaks on any other option name, and on an
+# over-long UTF-8 sequence when overlong_fatal is true.
+#
+# Returns nothing for undefined input.  Input that already has the utf8 flag
+# set is returned as-is (UTF-8 encoded first when bytes_only is true).
+# Otherwise returns the converted text: a character string by default, or a
+# string of UTF-8 bytes when bytes_only is true.
+sub fix_latin {
+    my $input = shift;
+    my %opt   = (
+        ascii_hex      => 1,
+        bytes_only     => 0,
+        overlong_fatal => 0,
+        @_
+    );
+
+    foreach (keys %opt) {
+        croak "Unknown option '$_'" unless $known_opt{$_};
+    }
+
+    return unless defined($input);
+    _init_byte_map() unless $byte_map;
+
+    if(is_utf8($input)) {       # input string already has utf8 flag set
+        if($opt{bytes_only}) {
+            return encode_utf8($input);
+        }
+        else {
+            return $input;
+        }
+    }
+
+    # Scan a plain string copy so that pos() tracking stays reliable even when
+    # the caller passed a reference or an object with overloaded stringification.
+    $input = "$input";
+
+    my $output = '';
+    my $olf    = $opt{overlong_fatal};
+
+    # Every match uses /gc: success advances pos() past the consumed bytes,
+    # failure leaves pos() untouched so that the next alternative is tried at
+    # the same offset.  The alternatives are tried in a fixed priority order.
+    pos($input) = 0;
+    while(pos($input) < length($input)) {
+        if($input =~ m/$ascii_str/gc) {
+            $output .= $1;
+        }
+        elsif($input =~ m/$utf8_2/gc) {
+            $output .= _decode_utf8($olf, ord($1) & 0x1F, $1, $2);
+        }
+        elsif($input =~ m/$utf8_3/gc) {
+            $output .= _decode_utf8($olf, ord($1) & 0x0F, $1, $2, $3);
+        }
+        elsif($input =~ m/$utf8_4/gc) {
+            $output .= _decode_utf8($olf, ord($1) & 0x07, $1, $2, $3, $4);
+        }
+        elsif($input =~ m/$utf8_5/gc) {
+            $output .= _decode_utf8($olf, ord($1) & 0x03, $1, $2, $3, $4, $5);
+        }
+        else {
+            # A high byte that does not start a UTF-8 sequence.  The loop
+            # condition guarantees a byte remains, so this always matches.
+            $input =~ m/$any_byte/gc;
+            my $char = $1;
+            if($opt{ascii_hex} && exists $non_1252{$char}) {
+                $output .= $non_1252{$char};
+            }
+            else {
+                $output .= $byte_map->{$char};
+            }
+        }
+    }
+    utf8::decode($output) unless $opt{bytes_only};
+    return $output;
+}
+
+
+# Decode one candidate UTF-8 sequence and re-encode it in its shortest form.
+#
+#   _decode_utf8($overlong_fatal, $lead_bits, @bytes)
+#
+# $lead_bits holds the payload bits already masked out of the lead byte;
+# @bytes is the whole sequence (lead byte first), each a one-byte string.
+# Returns the UTF-8 bytes of the resulting code point.  Croaks when
+# $overlong_fatal is true and the input used more bytes than necessary.
+sub _decode_utf8 {
+    my $overlong_fatal = shift;
+    my $c              = shift;
+    my $byte_count     = @_;
+    # Fold in the low six bits of each continuation byte ($_[0] is the lead).
+    foreach my $i (1..$#_) {
+        $c = ($c << 6) + (ord($_[$i]) & 0x3F);
+    }
+    my $bytes = encode_utf8(chr($c));
+    if($overlong_fatal and $byte_count > length($bytes)) {
+        my $hex_bytes = join ' ', map { sprintf('%02X', ord($_)) } @_;
+        croak "Over-long UTF-8 byte sequence: $hex_bytes";
+    }
+    return $bytes;
+}
+
+
+# Populate $byte_map: every byte 0x80-0xFF first gets the UTF-8 encoding of
+# the same-numbered code point, then _add_cp1252_mappings() overrides the
+# entries for which it has a CP1252 translation.
+sub _init_byte_map {
+    foreach my $i (0x80..0xFF) {
+        my $utf_char = chr($i);
+        utf8::encode($utf_char);
+        $byte_map->{pack('C', $i)} = $utf_char;
+    }
+    _add_cp1252_mappings();
+}
+
+
+# Overlay the CP1252 translations onto $byte_map.  Keys are single bytes,
+# values are the UTF-8 byte sequences of the corresponding characters.
+sub _add_cp1252_mappings {
+    # From http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
+    my %ms_map = (
+        "\x80" => "\xE2\x82\xAC",  # EURO SIGN
+        "\x82" => "\xE2\x80\x9A",  # SINGLE LOW-9 QUOTATION MARK
+        "\x83" => "\xC6\x92",      # LATIN SMALL LETTER F WITH HOOK
+        "\x84" => "\xE2\x80\x9E",  # DOUBLE LOW-9 QUOTATION MARK
+        "\x85" => "\xE2\x80\xA6",  # HORIZONTAL ELLIPSIS
+        "\x86" => "\xE2\x80\xA0",  # DAGGER
+        "\x87" => "\xE2\x80\xA1",  # DOUBLE DAGGER
+        "\x88" => "\xCB\x86",      # MODIFIER LETTER CIRCUMFLEX ACCENT
+        "\x89" => "\xE2\x80\xB0",  # PER MILLE SIGN
+        "\x8A" => "\xC5\xA0",      # LATIN CAPITAL LETTER S WITH CARON
+        "\x8B" => "\xE2\x80\xB9",  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+        "\x8C" => "\xC5\x92",      # LATIN CAPITAL LIGATURE OE
+        "\x8E" => "\xC5\xBD",      # LATIN CAPITAL LETTER Z WITH CARON
+        "\x91" => "\xE2\x80\x98",  # LEFT SINGLE QUOTATION MARK
+        "\x92" => "\xE2\x80\x99",  # RIGHT SINGLE QUOTATION MARK
+        "\x93" => "\xE2\x80\x9C",  # LEFT DOUBLE QUOTATION MARK
+        "\x94" => "\xE2\x80\x9D",  # RIGHT DOUBLE QUOTATION MARK
+        "\x95" => "\xE2\x80\xA2",  # BULLET
+        "\x96" => "\xE2\x80\x93",  # EN DASH
+        "\x97" => "\xE2\x80\x94",  # EM DASH
+        "\x98" => "\xCB\x9C",      # SMALL TILDE
+        "\x99" => "\xE2\x84\xA2",  # TRADE MARK SIGN
+        "\x9A" => "\xC5\xA1",      # LATIN SMALL LETTER S WITH CARON
+        "\x9B" => "\xE2\x80\xBA",  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+        "\x9C" => "\xC5\x93",      # LATIN SMALL LIGATURE OE
+        "\x9E" => "\xC5\xBE",      # LATIN SMALL LETTER Z WITH CARON
+        "\x9F" => "\xC5\xB8",      # LATIN CAPITAL LETTER Y WITH DIAERESIS
+    );
+    @{$byte_map}{keys %ms_map} = values %ms_map;
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+NB::Encoding::FixLatin - takes mixed encoding input and produces UTF-8 output
+
+
+=head1 SYNOPSIS
+
+  use NB::Encoding::FixLatin qw(fix_latin);
+
+  my $utf8_string = fix_latin($mixed_encoding_string);
+
+
+=head1 DESCRIPTION
+
+Most encoding conversion tools take input in one encoding and produce output in
+another encoding. This module takes input which may contain characters in more
+than one encoding and makes a best effort to convert them all to UTF-8 output.
+
+
+=head1 EXPORTS
+
+Nothing is exported by default. The only public function is C<fix_latin> which
+will be exported on request (as per SYNOPSIS).
+
+
+=head1 FUNCTIONS
+
+=head2 fix_latin( string, options ... )
+
+Decodes the supplied 'string' and returns a UTF-8 version of the string.
The +following rules are used: + +=over 4 + +=item * + +ASCII characters (single bytes in the range 0x00 - 0x7F) are passed through +unchanged. + +=item * + +Well-formed UTF-8 multi-byte characters are also passed through unchanged. + +=item * + +UTF-8 multi-byte characters which are over-long but otherwise well-formed are +converted to the shortest UTF-8 normal form. + +=item * + +Bytes in the range 0xA0 - 0xFF are assumed to be Latin-1 characters (ISO8859-1 +encoded) and are converted to UTF-8. + +=item * + +Bytes in the range 0x80 - 0x9F are assumed to be Win-Latin-1 characters (CP1252 +encoded) and are converted to UTF-8, except for the five bytes in this range +which are not defined in CP1252 (see the C<ascii_hex> option below). + +=back + +The Achilles heel of these rules is that it's possible for certain combinations +of two consecutive Latin-1 characters to be misinterpreted as a single UTF-8 +character - ie: there is some risk of data corruption. See the 'LIMITATIONS' +section below to quantify this risk for the type of data you're working with. + +If you pass in a string that is already a UTF-8 character string (the utf8 flag +is set on the Perl scalar) then the string will simply be returned unchanged. +However if the 'bytes_only' option is specified (see below), the returned +string will be a byte string rather than a character string. The rules +described above will not be applied in either case. + +The C<fix_latin> function accepts options as name => value pairs. Recognised +options are: + +=over 4 + +=item bytes_only => 1/0 + +The value returned by fix_latin is normally a Perl character string and will +have the utf8 flag set if it contains non-ASCII characters. If you set the +C<bytes_only> option to a true value, the returned string will be a binary +string of UTF-8 bytes. The utf8 flag will not be set. This is useful if +you're going to immediately use the string in an IO operation and wish to avoid +the overhead of converting to and from Perl's internal representation. 
+ +=item ascii_hex => 1/0 + +Bytes in the range 0x80-0x9F are assumed to be CP1252; however, CP1252 does not +define a mapping for 5 of these bytes (0x81, 0x8D, 0x8F, 0x90 and 0x9D). Use +this option to specify how they should be handled: + +=over 4 + +=item * + +If the ascii_hex option is set to true (the default), these bytes will be +converted to 3-character ASCII hex strings of the form %XX. For example, the +byte 0x81 will become %81. + +=item * + +If the ascii_hex option is set to false, these bytes will be treated as Latin-1 +control characters and converted to the equivalent UTF-8 multi-byte sequences. + +=back + +When processing text strings you will almost certainly never encounter these +bytes at all. The most likely reason you would see them is if a malicious +attacker was feeding random bytes to your application. It is difficult to +conceive of a scenario in which it makes sense to change this option from its +default setting. + +=item overlong_fatal => 1/0 + +An over-long UTF-8 byte sequence is one which uses more than the minimum number +of bytes required to represent the character. Use this option to specify how +over-long sequences should be handled. + +=over 4 + +=item * + +If the overlong_fatal option is set to false (the default), over-long sequences +will be converted to the shortest normal UTF-8 sequence. For example, the input +byte string "\xC0\xBCscript>" would be converted to "