# NodeNameNormalization.pm: output tree as normalized node name. # # Copyright 2010-2023 Free Software Foundation, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, # or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # # Original author: Patrice Dumas # the rules for conversion are decribed in the Texinfo manual, for # HTML crossrefs in the 'HTML Xref' node. package Texinfo::Convert::NodeNameNormalization; use 5.00405; use strict; # stop \s from matching non-ASCII spaces, etc. \p{...} can still be # used to match Unicode character classes. use if $] >= 5.014, re => '/a'; # To check if there is no erroneous autovivification #no autovivification qw(fetch delete exists store strict); use Unicode::Normalize; use Text::Unidecode; # commands classes use Texinfo::Commands; # for nobrace_symbol_text use Texinfo::Common; # use the hashes and functions use Texinfo::Convert::Unicode; # reuse some conversion hashes and ascii_accent function use Texinfo::Convert::Text; require Exporter; use vars qw($VERSION @ISA @EXPORT_OK %EXPORT_TAGS); @ISA = qw(Exporter); %EXPORT_TAGS = ( 'all' => [ qw( normalize_node normalize_transliterate_texinfo transliterate_texinfo transliterate_protect_file_name ) ] ); @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); $VERSION = '7.1'; my %normalize_node_brace_no_arg_commands = %Texinfo::Convert::Text::text_brace_no_arg_commands; foreach my $command (keys(%Texinfo::Convert::Unicode::unicode_character_brace_no_arg_commands)) { $normalize_node_brace_no_arg_commands{$command} = $Texinfo::Convert::Unicode::unicode_character_brace_no_arg_commands{$command}; } my %normalize_node_nobrace_symbol_text = %Texinfo::Common::nobrace_symbol_text; $normalize_node_nobrace_symbol_text{'*'} = ' '; my %accent_commands = %Texinfo::Commands::accent_commands; my %ignored_brace_commands; foreach my $ignored_brace_command ('anchor', 'footnote', 'shortcaption', 'caption', 'hyphenation', 'sortas', 'seealso', 'seeentry') { $ignored_brace_commands{$ignored_brace_command} = 1; } my %ignored_types; foreach my $type ('ignorable_spaces_after_command', 'postamble_after_end', 'preamble_before_beginning', 'spaces_at_end', 'spaces_before_paragraph', 'space_at_end_menu_node', 'spaces_after_close_brace') { $ignored_types{$type} = 1; } sub normalize_node($) { my $root = shift; my $result = _convert($root); $result = Unicode::Normalize::NFC($result); $result = _unicode_to_protected($result); $result = 'Top' if ($result =~ /^Top$/i); return $result; } sub normalize_transliterate_texinfo($;$) { my $root = shift; my $no_unidecode = shift; my $result = _convert($root); $result = Unicode::Normalize::NFC($result); $result = _unicode_to_protected( _unicode_to_transliterate($result, $no_unidecode)); return $result; } sub transliterate_texinfo($;$) { my $root = shift; my $no_unidecode = shift; my $result = _convert($root); $result = Unicode::Normalize::NFC($result); $result = _unicode_to_transliterate($result, $no_unidecode); return $result; } sub transliterate_protect_file_name($;$) { my $input_text = shift; my $no_unidecode = shift; my $result = Unicode::Normalize::NFC($input_text); $result = _unicode_to_file_name( _unicode_to_transliterate($result, $no_unidecode)); return $result; } sub convert_to_normalized($) { my $root = shift; my $result = _convert($root); } sub _protect_unicode_char($) { my $char = shift; if (exists($Texinfo::Convert::Unicode::unicode_simple_character_map{$char})) { return '_' . lc($Texinfo::Convert::Unicode::unicode_simple_character_map{$char}); } else { if (ord($char) <= hex(0xFFFF)) { return '_' . lc(sprintf("%04x",ord($char))); } else { return '__' . lc(sprintf("%06x",ord($char))); } } } sub _unicode_to_protected($) { my $text = shift; my $result = ''; while ($text ne '') { if ($text =~ s/^([A-Za-z0-9]+)//) { $result .= $1; } elsif ($text =~ s/^ +//) { $result .= '-'; # with /a some special spaces are not caught without /s, maybe because they are # considered as newlines } elsif ($text =~ s/^(.)//s) { $result .= _protect_unicode_char($1); } else { warn "Bug: unknown character _unicode_to_protected (likely in infinite loop)\n"; print STDERR "Text: !!$text!!\n"; sleep 1; } } return $result; } sub _unicode_to_file_name($) { my $text = shift; my $result = ''; while ($text ne '') { if ($text =~ s/^([A-Za-z0-9_\.\-]+)//) { $result .= $1; } elsif ($text =~ s/^ +//) { $result .= '-'; # /s is specified to caught special spaces considered as newlines too } elsif ($text =~ s/^(.)//s) { $result .= _protect_unicode_char($1); } else { warn "Bug: unknown character _unicode_to_file_name (likely in infinite loop)\n"; print STDERR "Text: !!$text!!\n"; sleep 1; } } return $result; } sub _unicode_to_transliterate($;$) { my $text = shift; my $no_unidecode = shift; if (chomp($text)) { warn "Bug: end of line to transliterate: $text\n"; } my $result = ''; while ($text ne '') { if ($text =~ s/^([A-Za-z0-9 ]+)//) { $result .= $1; } elsif ($text =~ s/^(.)//s) { my $char = $1; if (exists($Texinfo::Convert::Unicode::unicode_simple_character_map{$char})) { $result .= $char; } elsif (ord($char) <= hex(0xFFFF) and exists($Texinfo::Convert::Unicode::transliterate_map{uc(sprintf("%04x",ord($char)))})) { $result .= $Texinfo::Convert::Unicode::transliterate_map{uc(sprintf("%04x",ord($char)))}; } elsif (ord($char) <= hex(0xFFFF) and exists($Texinfo::Convert::Unicode::diacritics_accent_commands{uc(sprintf("%04x",ord($char)))})) { $result .= ''; # in this case, we want to avoid calling unidecode, as we are sure # that there is no useful transliteration of the unicode character # instead we want to keep it as is. # This is the case, for example, for @exclamdown, it corresponds # with x00a1, but unidecode transliterates it to a !, we want # to avoid that and keep x00a1. } elsif (ord($char) <= hex(0xFFFF) and exists($Texinfo::Convert::Unicode::no_transliterate_map{uc(sprintf("%04x",ord($char)))})) { $result .= $char; } else { if ($no_unidecode) { if (ord($char) <= hex(0xFFFF) and exists ($Texinfo::Convert::Unicode::transliterate_accent_map{uc(sprintf("%04x",ord($char)))})) { $result .= $Texinfo::Convert::Unicode::transliterate_accent_map{uc(sprintf("%04x",ord($char)))}; } else { $result .= $char; } } else { $result .= unidecode($char); } } #print STDERR " ($no_unidecode) $text -> CHAR: ".ord($char)." ".uc(sprintf("%04x",ord($char)))."\n$result\n"; } else { warn "Bug: unknown character _unicode_to_transliterate (likely in infinite loop)\n"; print STDERR "Text: !!$text!!\n"; sleep 1; } } return $result; } sub _convert($); sub _convert($) { my $element = shift; return '' if (($element->{'type'} and $ignored_types{$element->{'type'}}) or ($element->{'cmdname'} and ($ignored_brace_commands{$element->{'cmdname'}} # here ignore the line commands or ($element->{'args'} and $element->{'args'}->[0] and $element->{'args'}->[0]->{'type'} and ($element->{'args'}->[0]->{'type'} eq 'line_arg' or $element->{'args'}->[0]->{'type'} eq 'rawline_arg'))))); my $result = ''; if (defined($element->{'text'})) { $result = $element->{'text'}; $result =~ s/\s+/ /g; } if ($element->{'cmdname'}) { my $command = $element->{'cmdname'}; if (defined($normalize_node_nobrace_symbol_text{$element->{'cmdname'}})) { return $normalize_node_nobrace_symbol_text{$element->{'cmdname'}}; } elsif (defined($normalize_node_brace_no_arg_commands{$element->{'cmdname'}})) { $command = $element->{'extra'}->{'clickstyle'} if ($element->{'extra'} and defined($element->{'extra'}->{'clickstyle'}) and defined($normalize_node_brace_no_arg_commands{$element->{'extra'}->{'clickstyle'}})); my $result = $normalize_node_brace_no_arg_commands{$command}; return $result; # commands with braces } elsif ($accent_commands{$element->{'cmdname'}}) { return '' if (!$element->{'args'}); my $accent_text = _convert($element->{'args'}->[0]); my $accented_char = Texinfo::Convert::Unicode::unicode_accent($accent_text, $element); if (!defined($accented_char)) { # In this case, the node normalization do not follow the specification, # but we cannot do better $accented_char = Texinfo::Convert::Text::ascii_accent($accent_text, $element); } return $accented_char; } elsif ($Texinfo::Commands::ref_commands{$element->{'cmdname'}}) { my @args_try_order; if ($element->{'cmdname'} eq 'inforef') { @args_try_order = (0, 1, 2); } else { @args_try_order = (0, 1, 2, 4, 3); } foreach my $index (@args_try_order) { if (defined($element->{'args'}->[$index])) { my $text = _convert($element->{'args'}->[$index]); return $text if (defined($text) and $text =~ /\S/); } } return ''; # Here all the commands with args are processed, if they have # more than one arg the first one is used. } elsif ($element->{'args'} and $element->{'args'}->[0] and (($element->{'args'}->[0]->{'type'} and $element->{'args'}->[0]->{'type'} eq 'brace_command_arg') or $element->{'cmdname'} eq 'math')) { return _convert($element->{'args'}->[0]); } } if ($element->{'contents'}) { foreach my $content (@{$element->{'contents'}}) { $result .= _convert($content); } } return $result; } # Called from Texinfo::ParserNonXS and Texinfo::XS::parsetexi::Parsetexi. # This should be considered an internal function of the parsers for all # purposes, it is here to avoid code duplication. # Sets $self->{'nodes'} and $self->{'labels'} based on $self->{'targets'}. sub set_nodes_list_labels($$$) { my $self = shift; my $registrar = shift; my $configuration_information = shift; $self->{'nodes'} = []; my %labels = (); if (defined $self->{'targets'}) { for my $target (@{$self->{'targets'}}) { if ($target->{'cmdname'} eq 'node') { for (my $i = 1; $i < scalar(@{$target->{'args'}}); $i++) { my $arg = $target->{'args'}->[$i]; if ($arg->{'extra'} and $arg->{'extra'}->{'node_content'}) { my $normalized = Texinfo::Convert::NodeNameNormalization::normalize_node( {'contents' => $arg->{'extra'}->{'node_content'}}); $arg->{'extra'}->{'normalized'} = $normalized; } } } my $label_element = Texinfo::Common::get_label_element($target); if ($label_element and $label_element->{'contents'}) { my $normalized = Texinfo::Convert::NodeNameNormalization::normalize_node( $label_element); if ($normalized !~ /[^-]/) { $registrar->line_error($configuration_information, sprintf(__("empty node name after expansion `%s'"), # convert the contents only, to avoid spaces Texinfo::Convert::Texinfo::convert_to_texinfo( {'contents' => $label_element->{'contents'}})), $target->{'source_info'}); } else { if (defined $labels{$normalized}) { $registrar->line_error($configuration_information, sprintf(__("\@%s `%s' previously defined"), $target->{'cmdname'}, Texinfo::Convert::Texinfo::convert_to_texinfo( {'contents' => $label_element->{'contents'}})), $target->{'source_info'}); $registrar->line_error($configuration_information, sprintf(__("here is the previous definition as \@%s"), $labels{$normalized}->{'cmdname'}), $labels{$normalized}->{'source_info'}, 1); } else { $labels{$normalized} = $target; $target->{'extra'} = {} if (!$target->{'extra'}); $target->{'extra'}->{'normalized'} = $normalized; if ($target->{'cmdname'} eq 'node') { push @{$self->{'nodes'}}, $target; } } } } else { if ($target->{'cmdname'} eq 'node') { $registrar->line_error($configuration_information, sprintf(__("empty argument in \@%s"), $target->{'cmdname'}), $target->{'source_info'}); } } } } $self->{'labels'} = \%labels; } sub _parse_float_type($) { my $current = shift; my $normalized = ''; if ($current->{'args'} and scalar(@{$current->{'args'}})) { $normalized = convert_to_normalized($current->{'args'}->[0]); } $current->{'extra'} = {} if (!$current->{'extra'}); $current->{'extra'}->{'float_type'} = $normalized; return $normalized; } # Called from Texinfo::ParserNonXS and Texinfo::XS::parsetexi::Parsetexi. # This should be considered an internal function of the parsers for all # purposes, it is here to avoid code duplication. sub set_float_types { my $self = shift; $self->{'floats'} = {}; my $global_commands = $self->global_commands_information(); return unless ($global_commands); if ($global_commands->{'float'}) { foreach my $current (@{$global_commands->{'float'}}) { my $float_type = _parse_float_type($current); push @{$self->{'floats'}->{$float_type}}, $current; } } if ($global_commands->{'listoffloats'}) { foreach my $current (@{$global_commands->{'listoffloats'}}) { _parse_float_type($current); } } } 1; __END__ =head1 NAME Texinfo::Convert::NodeNameNormalization - Normalize and transliterate Texinfo trees =head1 SYNOPSIS use Texinfo::Convert::NodeNameNormalization qw(normalize_node normalize_transliterate_texinfo); my $normalized = normalize_node({'contents' => $node_contents}); my $file_name = normalize_transliterate_texinfo({'contents' => $section_contents}); =head1 NOTES The Texinfo Perl module main purpose is to be used in C to convert Texinfo to other formats. There is no promise of API stability. =head1 DESCRIPTION C allows to normalize node names, with C following the specification described in the Texinfo manual I node. This is useful whenever one want a unique identifier for Texinfo content, which is only composed of letter, digits, C<-> and C<_>. In L, C is used for C<@node>, C<@float> and C<@anchor> names normalization, but also C<@float> types and C<@acronym> and C<@abbr> first argument. It is also possible to transliterate non-ASCII letters, instead of mangling them, with C, losing the uniqueness feature of normalized node names. Another method, C transliterates non-ASCII letters and protect characters that should not appear on file names. =head1 METHODS =over =item $partially_normalized = convert_to_normalized($tree) X> The Texinfo I<$tree> is returned as a string, with @-commands and spaces normalized as described in the Texinfo manual I node. ASCII 7-bit characters other than spaces and non-ASCII characters are left as is in the resulting string. =item $normalized = normalize_node($tree) X> The Texinfo I<$tree> is returned as a string, normalized as described in the Texinfo manual I node. The result will be poor for Texinfo trees which are not @-command arguments (on an @-command line or in braces), for instance if the tree contains C<@node> or block commands. =item $transliterated = normalize_transliterate_texinfo($tree, $no_unidecode) X> The Texinfo I<$tree> is returned as a string, with non-ASCII letters transliterated as ASCII, but otherwise similar with C output. If the optional I<$no_unidecode> argument is set, C is not used for characters whose transliteration is not built-in. =item $transliterated = transliterate_texinfo($tree, $no_unidecode) X> The Texinfo I<$tree> is returned as a string, with non-ASCII letters transliterated as ASCII. If the optional I<$no_unidecode> argument is set, C is not used for characters whose transliteration is not built-in. =item $file_name = transliterate_protect_file_name($string, $no_unidecode) X> The string I<$string> is returned with non-ASCII letters transliterated as ASCII, and ASCII characters not safe in file names protected as in node normalization. If the optional I<$no_unidecode> argument is set, C is not used for characters whose transliteration is not built-in. =back =head1 AUTHOR Patrice Dumas, Epertusus@free.frE =head1 COPYRIGHT AND LICENSE Copyright 2010- Free Software Foundation, Inc. See the source file for all copyright years. This library is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. =cut