#!/usr/bin/env perl
# create ltj-unicode-ccfix.lua by processing Unicode data file
# LineBreak.txt
# modified from unicode-char-prep.pl (part of the XeTeX typesetting system).
# original copyright is as follows:
#
# /****************************************************************************\
# Part of the XeTeX typesetting system
# Copyright (c) 1994-2008 by SIL International
# Copyright (c) 2009 by Jonathan Kew
#
# SIL Author(s): Jonathan Kew
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE
# FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# Except as contained in this notice, the name of the copyright holders
# shall not be used in advertising or otherwise to promote the sale,
# use or other dealings in this Software without prior written
# authorization from the copyright holders.
# \****************************************************************************/

use strict;
use warnings;

# Exactly one argument is required: the path of the LineBreak.txt data file.
# The generated Lua code is written to standard output.
die "usage: perl $0 LineBreak.txt > ltj-unicode-ccfix.lua\n" unless @ARGV == 1;

# Package-level tables filled by parse_unidata().
our (%lccode, %uccode, @letters, @marks, @casesym);

# parse_unidata(@fields)
#
# Records case-mapping and classification data for one record whose fields
# are passed as a list; field 0 is used as the key.
# NOTE(review): presumably the fields of a UnicodeData.txt record (field 2 a
# general category, fields 12/13 upper/lower case mappings) -- inherited from
# unicode-char-prep.pl; confirm before relying on it.
#
# This sub is not called anywhere in this script.  It is retained unchanged
# in behavior; only its target tables are now declared.
sub parse_unidata {
    my (@u) = @_;

    # Lower-case code: the explicit mapping in field 13 if present, otherwise
    # the character itself when field 2 starts with "L" or field 12 is set.
    $lccode{$u[0]} = $u[13] if $u[13] ne '';
    $lccode{$u[0]} = $u[0]  if $u[13] eq '' and ($u[2] =~ /^L/ or $u[12] ne '');

    # Upper-case code: the mirror image, using field 12.
    $uccode{$u[0]} = $u[12] if $u[12] ne '';
    $uccode{$u[0]} = $u[0]  if $u[12] eq '' and ($u[2] =~ /^L/ or $u[13] ne '');

    # Sort the key into exactly one of the three lists.
    if ($u[2] =~ /^L/) {
        push(@letters, $u[0]);
    }
    elsif ($u[2] =~ /^M/) {
        push(@marks, $u[0]);
    }
    elsif (exists $lccode{$u[0]} or exists $uccode{$u[0]}) {
        push(@casesym, $u[0]);
    }
}

# Timestamp for the generated-file banner, taken from the external `date`
# command.  Fall back to an empty string if the command produced nothing.
my $date = `date`;
$date = '' unless defined $date;
chomp $date;

# Banner of the generated Lua file ($date is interpolated).
print <<"__EOT__";
-- Do not edit this file!
-- Created from LineBreak.txt by ltj-unicode-ccfix_make.pl on $date.
-- In case of errors, fix the Perl script instead.
__EOT__

# Line-break classes that are acted upon, mapped to an internal class number.
my %lineBreakClass = (
    'ID' => 1,    # ideograph
);

# Lua prologue: defines set_letter(b,e), which the generated calls below use.
print <<'__EOT__';
local tex_catcode = tex.setcatcode
local tex_getcc = tex.getcatcode
local function set_letter(b,e)
  if tex_getcc(b)~=11 then
    for i=b,e do tex_catcode('global', i, 11) end
  end
end
__EOT__

# Three-argument open with a lexical handle: the file name comes from the
# command line and must never be interpreted as an open mode or a pipe.
open my $linebreak_fh, '<', $ARGV[0]
    or die "can't read $ARGV[0]: $!";

while (my $line = <$linebreak_fh>) {
    chomp $line;
    $line =~ s/ *#.*//;    # strip trailing comment
    $line =~ s/ +$//;      # strip trailing blanks

    # A record is "XXXX;CLASS" or "XXXX..YYYY;CLASS".  Whitespace around the
    # semicolon is tolerated because UCD data files may pad their fields.
    next unless $line =~ m/^\s*([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))?\s*;\s*(\w+)/;
    my ($s, $e, $lb) = ($1, $2, $3);

    # A single code point is treated as a one-element range.
    $e = $s unless defined $e and $e ne '';

    next unless exists $lineBreakClass{$lb};
    if ($lineBreakClass{$lb} == 1) {
        # ideographs: set whole range to class 1
        print "set_letter(0x$s,0x$e)\n";
    }
}
close $linebreak_fh;

# Trailer of the generated Lua file (currently empty).
print <<'__EOT__';
__EOT__