#! /usr/bin/env perl # # texixml2texi -- convert Texinfo XML to Texinfo code # # Copyright 2012-2023 Free Software Foundation, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, # or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # # Original author: Patrice Dumas use strict; use Getopt::Long qw(GetOptions); # for dirname. use File::Basename; use File::Spec; use Encode; Getopt::Long::Configure("gnu_getopt"); BEGIN { # emulate -w $^W = 1; my ($real_command_name, $command_directory, $command_suffix) = fileparse($0, '.pl'); my $updir = File::Spec->updir(); my $datadir = '@datadir@'; my $package = '@PACKAGE@'; my $xsdir = '@pkglibdir@'; # in-source run if ($datadir eq '@' .'datadir@' or defined($ENV{'TEXINFO_DEV_SOURCE'}) and $ENV{'TEXINFO_DEV_SOURCE'} ne '0') { # Use uninstalled modules if (defined($ENV{'top_builddir'})) { unshift @INC, File::Spec->catdir($ENV{'top_builddir'}, 'tp'); } else { unshift @INC, File::Spec->catdir($command_directory, $updir, 'tp'); } require Texinfo::ModulePath; Texinfo::ModulePath::init(undef, undef, undef, 'updirs' => 1); } else { # Look for modules in their installed locations. my $lib_dir = File::Spec->catdir($datadir, $package); # look for package data in the installed location. my $modules_pkgdatadir = $lib_dir; # try to make package relocatable, will only work if # standard relative paths are used if (! -f File::Spec->catfile($lib_dir, 'Texinfo', 'Parser.pm') and -f File::Spec->catfile($command_directory, $updir, 'share', $package, 'Texinfo', 'Parser.pm')) { $lib_dir = File::Spec->catdir($command_directory, $updir, 'share', $package); $modules_pkgdatadir = File::Spec->catdir($command_directory, $updir, 'share', $package); $xsdir = File::Spec->catdir($command_directory, $updir, 'lib', $package); } unshift @INC, $lib_dir; require Texinfo::ModulePath; Texinfo::ModulePath::init($lib_dir, $xsdir, $modules_pkgdatadir, 'installed' => 1); } } use XML::LibXML::Reader; use Texinfo::Commands; # gather information on Texinfo markup language elements use Texinfo::Convert::TexinfoMarkup; my $debug = 0; my $result_options = Getopt::Long::GetOptions ( 'debug|d' => \$debug, ); # to get unbuffered output if ($debug) { my $previous_default = select(STDOUT); # save previous default $|++; # autoflush STDOUT select(STDERR); $|++; # autoflush STDERR, to be sure select($previous_default); # restore previous default } sub command_with_braces($) { my $command = shift; if ($command =~ /^[a-z]/i) { return "\@".$command.'{}'; } else { return "\@".$command; } } my %ignored_elements = ( 'prepend' => 1, 'formalarg' => 1, # not ignored everytime 'indexterm' => 1, ); my %elements_end_attributes = ( 'accent' => 1, 'menunode' => 1, 'menutitle' => 1, ); # keys are markup elements. If the element is associated to one @-command # only, the value is a string, the corresponding @-command formatted. # If the element is associated to more than one element, the value is a # hash to select the command based on an attribute value. They key of the # hash attribute is an attribute name and the value is another hash # reference which associates an attribute value to the formatted @-command # string. my %element_at_commands; # entities not associated to @-commands my %entity_texts = ( 'textldquo' => '``', 'textrdquo' => "''", 'textmdash' => '---', 'textndash' => '--', 'textrsquo' => "'", 'textlsquo' => '`', 'formfeed' => "\f", 'verticaltab' => "\x{000B}", # following mappings are not used in pratice, as attrformfeed and similar # appear in attributes and thus are already expanded to text. 'attrformfeed' => "\f", 'attrverticaltab' => "\x{000B}", ); # contains nobrace symbol and brace noarg commands my %no_arg_commands_formatting = %Texinfo::Convert::TexinfoMarkup::no_arg_commands_formatting; foreach my $command (keys(%no_arg_commands_formatting)) { if (!ref($no_arg_commands_formatting{$command})) { $entity_texts{$no_arg_commands_formatting{$command}} = command_with_braces($command); } else { my $spec = $no_arg_commands_formatting{$command}; my $element = $spec->[0]; if ($element eq 'spacecmd') { if ($spec->[1]->[0] eq 'type') { $element_at_commands{$element}->{"type"}->{$spec->[1]->[1]} = command_with_braces($command); } else { die "BUG, bad spacecmd specification"; } } else { $element_at_commands{$element} = command_with_braces($command); } } } my %arg_elements; my %variadic_elements; foreach my $command (keys(%Texinfo::Convert::TexinfoMarkup::commands_args_elements)) { my $arg_index = 0; my @element_arguments = @{$Texinfo::Convert::TexinfoMarkup::commands_args_elements{$command}}; foreach my $element_argument (@element_arguments) { if ($element_argument ne '*') { # need to disambiguate by command as some markup argument elements # are common, for @*ref, for example. It should not matter, in general, # if the argument indices are the same, which should be the case, but # it is cleaner anyway. $arg_elements{$element_argument}->{$command} = $arg_index; } else { my $previous_element_argument = $element_arguments[$arg_index - 1]; $variadic_elements{$previous_element_argument}->{$command} = $arg_index; } $arg_index++; } } my %accent_type_command; foreach my $accent_command (keys(%Texinfo::Convert::TexinfoMarkup::accent_types)) { $accent_type_command{$Texinfo::Convert::TexinfoMarkup::accent_types{$accent_command}} = $accent_command; } my %eat_space_elements; foreach my $element ('texinfo', 'filename') { $eat_space_elements{$element} = 1; } my $infile = shift @ARGV; if (!defined($infile) or $infile !~ /\S/) { die "Missing file\n"; } my $reader_options = {'location' => $infile, 'expand_entities' => 0, 'pedantic_parser' => $debug, #'validation' => 1, }; my $reader = XML::LibXML::Reader->new($reader_options) or die "cannot read $infile\n"; #(my $mydir = $0) =~ s,/[^/]*$,,; # dir we are in #my $txi_dtd_libdir = "$mydir"; # find tp relative to $0 sub skip_until_end($$) { my $reader = shift; my $name = shift; while ($reader->read) { if ($reader->nodeType() eq XML_READER_TYPE_END_ELEMENT and $reader->name eq $name) { return; } } } sub unprotect_spaces($) { my $spaces = shift; if (defined($spaces)) { $spaces =~ s/\\n/\n/g; # convert back formfeed and other special characters $spaces =~ s/\\f/\f/g; $spaces =~ s/\\v/\x{000B}/g; return $spaces; } else { return ''; } } my $eat_space = 0; my $skip_comment = 0; my @commands_with_args_stack; while ($reader->read) { # ============================================================ begin debug if ($debug) { my $args_stack_str = join('|', map {'['.$_->[0].','.$_->[1].']'} @commands_with_args_stack); printf STDERR "(args: $args_stack_str) (eat_space $eat_space) (skip_comment $skip_comment) %d %d %s %d", ( $reader->depth, $reader->nodeType, $reader->name, $reader->isEmptyElement); my $value = ''; if ($reader->hasValue()) { $value = $reader->value(); $value =~ s/\n/\\n/g; print STDERR " |$value|"; } if ($reader->nodeType() eq XML_READER_TYPE_ELEMENT and $reader->hasAttributes() and defined($reader->getAttribute('spaces'))) { my $spaces = $reader->getAttribute('spaces'); print STDERR " spaces:$spaces|"; } print STDERR "\n"; } # ============================================================ end debug if ($reader->nodeType() eq XML_READER_TYPE_SIGNIFICANT_WHITESPACE and $eat_space) { $eat_space = 0; next; } elsif ($reader->nodeType() eq XML_READER_TYPE_TEXT or $reader->nodeType() eq XML_READER_TYPE_WHITESPACE or $reader->nodeType() eq XML_READER_TYPE_SIGNIFICANT_WHITESPACE ) { if ($reader->hasValue()) { print $reader->value(); } } my $name = $reader->name; if ($reader->nodeType() eq XML_READER_TYPE_ELEMENT) { my $user_defined_index_command; if (($name eq 'entry' or $name eq 'indexcommand') and $reader->hasAttributes() and defined($reader->getAttribute('command'))) { $name = $reader->getAttribute('command'); $user_defined_index_command = 1; } elsif ($name eq 'listitem') { $name = 'item'; } if ($Texinfo::Convert::TexinfoMarkup::commands_args_elements{$name}) { push @commands_with_args_stack, [$name, 0]; } my $spaces = unprotect_spaces($reader->getAttribute('spaces')); my $spaces_after_command = unprotect_spaces($reader->getAttribute('spacesaftercmd')); if ($name eq 'accent') { if ($reader->hasAttributes()) { if (defined($reader->getAttribute('type'))) { my $command = $accent_type_command{$reader->getAttribute('type')}; print "\@${command}${spaces_after_command}" if (defined($command)); } if (!(defined($reader->getAttribute('bracketed')) and $reader->getAttribute('bracketed') eq 'off')) { print '{'; } } else { print '{'; } } elsif (exists $element_at_commands{$name}) { if (!ref($element_at_commands{$name})) { print $element_at_commands{$name}; } else { my ($attribute) = keys(%{$element_at_commands{$name}}); if ($reader->hasAttributes() and defined($reader->getAttribute($attribute))) { print $element_at_commands{$name}->{$attribute}->{ $reader->getAttribute($attribute)}; } } } elsif (exists($Texinfo::Commands::brace_commands{$name})) { print "\@${name}${spaces_after_command}".'{'; if ($name eq 'verb' and $reader->hasAttributes() and defined($reader->getAttribute('delimiter'))) { print $reader->getAttribute('delimiter'); } print "$spaces"; } elsif (exists($Texinfo::Commands::block_commands{$name})) { print "\@$name"; if ($name eq 'macro' or $name eq 'rmacro') { if ($reader->hasAttributes() and defined($reader->getAttribute('line'))) { print $reader->getAttribute('line'); } } else { # leading spaces are already in the line attribute for (r)macro print "$spaces"; } } elsif (defined($Texinfo::Commands::line_commands{$name}) or defined($Texinfo::Commands::nobrace_commands{$name}) or $user_defined_index_command) { if ($reader->hasAttributes() and defined($reader->getAttribute('originalcommand'))) { $name = $reader->getAttribute('originalcommand'); } if ($name eq 'documentencoding' and $reader->hasAttributes() and defined($reader->getAttribute('encoding'))) { my $perl_encoding = Encode::resolve_alias($reader->getAttribute('encoding')); if (defined($perl_encoding)) { if ($debug) { print STDERR "Using encoding $perl_encoding\n"; } binmode(STDOUT, ":encoding($perl_encoding)"); } } print "\@$name$spaces"; if ($reader->hasAttributes() and defined($reader->getAttribute('line'))) { my $line = $reader->getAttribute('line'); $line =~ s/\\\\/\x{1F}/g; # convert back formfeed $line =~ s/\\f/\f/g; $line =~ s/\x{1F}/\\/g; # FIXME needed? #$line =~ s/\\v/\x{000B}/g; print $line; } my $specific_line = (defined($Texinfo::Commands::line_commands{$name}) and $Texinfo::Commands::line_commands{$name} eq 'specific'); if ($name eq 'set' or $name eq 'clickstyle' or $name eq 'columnfractions' or $specific_line) { skip_until_end($reader, $name); if ($name eq 'columnfractions' or $specific_line) { # specific line commands have a line argument obtained by converting # their line to Texinfo, which would include a comment on the line, # and could also have a comment associated to the command appearing # after the command as an XML comment. Similar for columnfraction. # # We skip the possibly existing redundant XML comment following the # closing element. # # start at 2 as there is a -1 right down at the end of # the loop, and another -1 for the next element (possibly # an ignored comment). $skip_comment = 2; } } } elsif ($arg_elements{$name}) { # elements corresponding to @-commands arguments if ($reader->hasAttributes() and defined($reader->getAttribute('automatic')) and $reader->getAttribute('automatic') eq 'on') { skip_until_end($reader, $name); next; } my ($command, $current_index) = @{$commands_with_args_stack[-1]}; my $arg_element_index = $arg_elements{$name}->{$command}; if ($commands_with_args_stack[-1]->[1] < $arg_element_index) { while ($commands_with_args_stack[-1]->[1] < $arg_element_index) { $commands_with_args_stack[-1]->[1]++; print ','; } } elsif ($commands_with_args_stack[-1]->[1] > 0) { # the index is already at or above the argument index. Either it is # a variadic command, or an incorrect input. if ($variadic_elements{$name} and defined($variadic_elements{$name}->{$command})) { $commands_with_args_stack[-1]->[1]++; print ','; # a debug consistency check my $variadic_arg_index = $variadic_elements{$name}->{$command}; if ($commands_with_args_stack[-1]->[1] < $variadic_arg_index) { print STDERR "BUG: $command: $name: current index < variadic_arg_index: " ."$commands_with_args_stack[-1]->[1] < $variadic_arg_index\n" if ($debug); } } else { # could happen with duplicate markup argument elements and # markup argument elements in the wrong order print STDERR "BAD INPUT: $command: $name(not variadic): " ."current index >= arg_element_index: " ."$commands_with_args_stack[-1]->[1] >= $arg_element_index\n" if ($debug); } } print "$spaces"; } elsif ($ignored_elements{$name}) { my $keep_indexterm = 0; if ($name eq 'indexterm') { my $node_path = $reader->nodePath(); if ($node_path =~ m:([a-z]+)/indexterm$:) { my $parent = $1; if ($parent =~ /^[a-z]?[a-z]index$/ or $parent eq 'indexcommand') { $keep_indexterm = 1; } } } if (!$keep_indexterm) { #print STDERR "IGNORE $name\n" if ($debug); skip_until_end($reader, $name); next; } } elsif ($name eq 'formattingcommand') { if ($reader->hasAttributes()) { if (defined($reader->getAttribute('command')) and (not (defined($reader->getAttribute('automatic')) and $reader->getAttribute('automatic') eq 'on'))) { print '@'.$reader->getAttribute('command'); } } } elsif ($name eq 'infoenclose') { if ($reader->hasAttributes() and defined($reader->getAttribute('command'))) { my $command = $reader->getAttribute('command'); print "\@${command}${spaces_after_command}".'{'."$spaces"; } # def* automatic } elsif ($reader->hasAttributes() and defined($reader->getAttribute('automatic')) and $reader->getAttribute('automatic') eq 'on') { skip_until_end($reader, $name); # eat the following space $reader->read(); } elsif ($eat_space_elements{$name}) { $eat_space = 1; } else { print STDERR "UNKNOWN $name\n" if ($debug); } if ($reader->hasAttributes()) { if (defined($reader->getAttribute('bracketed')) and $reader->getAttribute('bracketed') eq 'on') { print '{'."$spaces"; } # menus 'star' and following spaces if (defined($reader->getAttribute('leadingtext'))) { print $reader->getAttribute('leadingtext'); } } if ($Texinfo::Commands::block_commands{$name} and $Texinfo::Commands::block_commands{$name} eq 'item_line' and $reader->hasAttributes() and defined($reader->getAttribute('commandarg'))) { # happens when formatting command argument is missing and there # are no spaces. print ' ' if ($spaces eq ''); print '@'.$reader->getAttribute('commandarg'); } } elsif ($reader->nodeType() eq XML_READER_TYPE_END_ELEMENT) { if ($Texinfo::Convert::TexinfoMarkup::commands_args_elements{$name}) { pop @commands_with_args_stack; } my $trailingspaces = ''; if ($reader->hasAttributes() and defined($reader->getAttribute('trailingspaces'))) { $trailingspaces = unprotect_spaces($reader->getAttribute('trailingspaces')); } if ($reader->hasAttributes()) { if (defined($reader->getAttribute('bracketed')) and $reader->getAttribute('bracketed') eq 'on') { print "$trailingspaces"; # such that spaces are not prepended below when prepended # for elements without bracketed attribute below $trailingspaces = ''; print '}'; } } if (exists ($Texinfo::Commands::brace_commands{$name})) { if ($name eq 'verb' and $reader->hasAttributes() and defined($reader->getAttribute('delimiter'))) { print $reader->getAttribute('delimiter'); } print '}'; } elsif (exists($Texinfo::Commands::block_commands{$name})) { my $end_spaces; if ($reader->hasAttributes() and defined($reader->getAttribute('endspaces'))) { $end_spaces = $reader->getAttribute('endspaces'); } $end_spaces = ' ' if (!defined($end_spaces) or $end_spaces eq ''); print "\@end".$end_spaces."$name"; } elsif (defined($Texinfo::Commands::line_commands{$name}) or defined($Texinfo::Commands::nobrace_commands{$name})) { if ($Texinfo::Commands::root_commands{$name} and $name ne 'node') { $eat_space = 1; } print "$trailingspaces"; } elsif ($elements_end_attributes{$name}) { if ($name eq 'accent') { if ($reader->hasAttributes()) { if (!(defined($reader->getAttribute('bracketed')) and $reader->getAttribute('bracketed') eq 'off')) { print '}'; } } else { print '}'; } } elsif ($reader->hasAttributes() and defined($reader->getAttribute('separator'))) { print $reader->getAttribute('separator'); print "$trailingspaces"; } } elsif ($name eq 'infoenclose') { print "$trailingspaces".'}'; } elsif ($eat_space_elements{$name}) { $eat_space = 1; } else { print STDERR "END UNKNOWN $name\n" if ($debug); print "$trailingspaces"; } } elsif ($reader->nodeType() eq XML_READER_TYPE_ENTITY_REFERENCE) { # for some reason XML_READER_TYPE_ENTITY is never emitted # or $reader->nodeType() eq XML_READER_TYPE_ENTITY) { if (defined($entity_texts{$name})) { print $entity_texts{$name}; } } elsif ($reader->nodeType() eq XML_READER_TYPE_COMMENT) { my $comment; if ($reader->hasValue()) { $comment = $reader->value(); $comment =~ s/^ (comment|c)//; my $command = $1; $comment =~ s/ $//; print "\@${command}$comment" unless ($skip_comment); } } elsif ($reader->nodeType() eq XML_READER_TYPE_DOCUMENT_TYPE) { $eat_space = 1; } $skip_comment-- if ($skip_comment); } 1;