#!/usr/bin/perl -w
use strict;

# jp_data_toc.pl : reads dictionary records (a kana reading followed by a word)
# from the files named on the command line or from STDIN, and prints C source
# text on STDOUT: an #include of "japanese_data.h", the japanese_kanjitab[]
# array of 16-bit values, the japanese_kanatab[] byte array, and a final table
# of { index, packed value } rows.  Statistics are printed on STDERR.
#
# This uses every kludge I could think of to cut down the size of the datafile
# You have been warned.

# Records end with the byte pair NUL, LF.  Together with the NUL-space field
# separator and the unpack("n*") below, this is consistent with big-endian
# 16-bit text input -- presumably UCS-2; confirm against the data file.
$/="\0\012";

print "#include \"japanese_data.h\"\n\n";
print 'const unsigned short japanese_kanjitab[]= { ';

my $current="";       # raw kana key of the group being accumulated
my @dict;             # flat list of triples: kana-table index, start offset, entry count
my $num=1;            # number of entries seen so far for $current
my $off=0;            # running count of 16-bit words printed into japanese_kanjitab
my $coff=0;           # value of $off when the current group started
my $orig_entries=0;   # total number of input records read
my $kanastr="";       # all distinct encoded kana keys, concatenated
my %kanaidx=();       # encoded kana key -> its byte offset inside $kanastr
my $ckana="";         # encoded form of $current (a key of %kanaidx)

while (<>) {
    chomp;
    $orig_entries++;
    # Fields are separated by the byte pair NUL, space.
    my ($kana, $w)=split("\0 ");
    if ($current ne $kana) {
        # A new reading starts here: encode it relative to the previous one.
        my $tmp="";
        my $n=0;
        # Count the leading 2-byte units this key shares with the previous key.
        $n++ while (substr($kana, $n*2, 2) eq substr($current, $n*2, 2));
        # Encoded key = one byte holding the shared-unit count, followed by
        # the second byte only of every remaining 2-byte unit.
        $tmp=chr($n);
        $tmp.=substr($kana, ($n++)*2+1, 1) while ($n*2 < length($kana));
        # Store each distinct encoded key once and remember where it starts.
        if (!exists $kanaidx{$tmp}) {
            $kanaidx{$tmp}=length($kanastr);
            $kanastr.=$tmp;
        }
        # $num still holds the size of the group that has just ended.
        # NOTE(review): the reason for the limit of 62 is not visible in this
        # text -- presumably tied to how the count is packed further down.
        if ($num>62) {
            die "found a group of more than 62 homonyms - argh";
        }
        # Record the finished group; skipped while there is no previous key
        # (first record).
        # NOTE(review): no push for the very last group is visible in the
        # surviving text -- check whether the lost span further down did it.
        if ($current) {
            push @dict, $kanaidx{$ckana}, $coff, $num;
        }
        $current=$kana;
        $num=1;
        $coff=$off;
        $ckana=$tmp;
    } else {
        # Same reading as the previous record: one more entry in the group.
        $num++;
    };

    # Print the word as 16-bit values, read big-endian from the raw bytes.
    my @w=unpack("n*", $w);
    my $first=1;
    while (@w) {
        my $a=shift @w;
        # Values outside 0x3000..0xafff only produce a warning; they are
        # still printed below.
        if ($a < 0x3000 || $a > 0xafff) {
            warn ("Character out of range : $a at line $.");
        }
        # Each value is rebased by 0x3000; 0x8000 is added to the first value
        # of every word, which marks where a word starts.
        print sprintf("0x%x", $a-0x3000 + $first*0x8000);
        $first=0;
        if (@w) { print ", "; }
    }
    # Separator after every word except at end of input.  Bare eof tests the
    # file last read, so it is also true at the end of each individual input
    # file when several are given.
    if (!eof) { print ", \n"; };
    $off+=length($w)/2;
}

print " };\n\n";
print "const unsigned char japanese_kanatab[] = { \n ";

# NOTE(review): the statement below is not valid Perl as it stands.  The text
# between `$i` and `0x7fffff` appears to have been lost (possibly everything
# from a `<` to a later `>` was stripped).  Nothing in the surviving text
# declares $kana_entries, or $w, $o and $n at this scope, and the closing
# brace after the third print below has no visible opening, so the lost span
# presumably held the rest of this loop, those declarations, and the start of
# the loop that prints the final table.  Restore it from a pristine copy of
# the script rather than by guesswork.
for (my $i=0; $i 0x7fffff) {
    die "kanji table too long - can't kludge properly ^^;";
}
# One table row: $w, then $n scaled by 0x1000000 added to $o.
print " { $w, ";
print $o + 0x1000000 * $n;
print " },\n";
}
# Final row: total length of the kana string and 0.
print " { ", length($kanastr), ", 0 } \n};\n";

# Summary statistics on STDERR.
print STDERR "jp_data_toc.pl : processed $orig_entries dictionary entries\n";
print STDERR " found ", $kana_entries, " phonetic entries\n";
print STDERR " printed ", 1+length($kanastr), " bytes of phonetic table\n";
print STDERR " printed ", $off, " words of dictionary table\n";