#! /usr/bin/env perl
#
# texixml2texi -- convert Texinfo XML to Texinfo code
#
# Copyright 2012-2023 Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
#
# Original author: Patrice Dumas
use strict;
use Getopt::Long qw(GetOptions);
# for dirname.
use File::Basename;
use File::Spec;
use Encode;
Getopt::Long::Configure("gnu_getopt");
BEGIN
{
# emulate -w
$^W = 1;
my ($real_command_name, $command_directory, $command_suffix)
= fileparse($0, '.pl');
my $updir = File::Spec->updir();
my $datadir = '@datadir@';
my $package = '@PACKAGE@';
my $xsdir = '@pkglibdir@';
# in-source run
if ($datadir eq '@' .'datadir@'
or defined($ENV{'TEXINFO_DEV_SOURCE'})
and $ENV{'TEXINFO_DEV_SOURCE'} ne '0') {
# Use uninstalled modules
if (defined($ENV{'top_builddir'})) {
unshift @INC, File::Spec->catdir($ENV{'top_builddir'}, 'tp');
} else {
unshift @INC, File::Spec->catdir($command_directory, $updir, 'tp');
}
require Texinfo::ModulePath;
Texinfo::ModulePath::init(undef, undef, undef, 'updirs' => 1);
} else {
# Look for modules in their installed locations.
my $lib_dir = File::Spec->catdir($datadir, $package);
# look for package data in the installed location.
my $modules_pkgdatadir = $lib_dir;
# try to make package relocatable, will only work if
# standard relative paths are used
if (! -f File::Spec->catfile($lib_dir, 'Texinfo', 'Parser.pm')
and -f File::Spec->catfile($command_directory, $updir, 'share',
$package, 'Texinfo', 'Parser.pm')) {
$lib_dir = File::Spec->catdir($command_directory, $updir,
'share', $package);
$modules_pkgdatadir = File::Spec->catdir($command_directory, $updir,
'share', $package);
$xsdir = File::Spec->catdir($command_directory, $updir,
'lib', $package);
}
unshift @INC, $lib_dir;
require Texinfo::ModulePath;
Texinfo::ModulePath::init($lib_dir, $xsdir, $modules_pkgdatadir,
'installed' => 1);
}
}
use XML::LibXML::Reader;
use Texinfo::Commands;
# gather information on Texinfo markup language elements
use Texinfo::Convert::TexinfoMarkup;
my $debug = 0;
my $result_options = Getopt::Long::GetOptions (
'debug|d' => \$debug,
);
# to get unbuffered output
if ($debug) {
my $previous_default = select(STDOUT); # save previous default
$|++; # autoflush STDOUT
select(STDERR);
$|++; # autoflush STDERR, to be sure
select($previous_default); # restore previous default
}
sub command_with_braces($)
{
my $command = shift;
if ($command =~ /^[a-z]/i) {
return "\@".$command.'{}';
} else {
return "\@".$command;
}
}
my %ignored_elements = (
'prepend' => 1,
'formalarg' => 1,
# not ignored everytime
'indexterm' => 1,
);
my %elements_end_attributes = (
'accent' => 1,
'menunode' => 1,
'menutitle' => 1,
);
# keys are markup elements. If the element is associated to one @-command
# only, the value is a string, the corresponding @-command formatted.
# If the element is associated to more than one element, the value is a
# hash to select the command based on an attribute value. They key of the
# hash attribute is an attribute name and the value is another hash
# reference which associates an attribute value to the formatted @-command
# string.
my %element_at_commands;
# entities not associated to @-commands
my %entity_texts = (
'textldquo' => '``',
'textrdquo' => "''",
'textmdash' => '---',
'textndash' => '--',
'textrsquo' => "'",
'textlsquo' => '`',
'formfeed' => "\f",
'verticaltab' => "\x{000B}",
# following mappings are not used in pratice, as attrformfeed and similar
# appear in attributes and thus are already expanded to text.
'attrformfeed' => "\f",
'attrverticaltab' => "\x{000B}",
);
# contains nobrace symbol and brace noarg commands
my %no_arg_commands_formatting
= %Texinfo::Convert::TexinfoMarkup::no_arg_commands_formatting;
foreach my $command (keys(%no_arg_commands_formatting)) {
if (!ref($no_arg_commands_formatting{$command})) {
$entity_texts{$no_arg_commands_formatting{$command}}
= command_with_braces($command);
} else {
my $spec = $no_arg_commands_formatting{$command};
my $element = $spec->[0];
if ($element eq 'spacecmd') {
if ($spec->[1]->[0] eq 'type') {
$element_at_commands{$element}->{"type"}->{$spec->[1]->[1]}
= command_with_braces($command);
} else {
die "BUG, bad spacecmd specification";
}
} else {
$element_at_commands{$element} = command_with_braces($command);
}
}
}
my %arg_elements;
my %variadic_elements;
foreach my $command (keys(%Texinfo::Convert::TexinfoMarkup::commands_args_elements)) {
my $arg_index = 0;
my @element_arguments
= @{$Texinfo::Convert::TexinfoMarkup::commands_args_elements{$command}};
foreach my $element_argument (@element_arguments) {
if ($element_argument ne '*') {
# need to disambiguate by command as some markup argument elements
# are common, for @*ref, for example. It should not matter, in general,
# if the argument indices are the same, which should be the case, but
# it is cleaner anyway.
$arg_elements{$element_argument}->{$command} = $arg_index;
} else {
my $previous_element_argument = $element_arguments[$arg_index - 1];
$variadic_elements{$previous_element_argument}->{$command} = $arg_index;
}
$arg_index++;
}
}
my %accent_type_command;
foreach my $accent_command (keys(%Texinfo::Convert::TexinfoMarkup::accent_types)) {
$accent_type_command{$Texinfo::Convert::TexinfoMarkup::accent_types{$accent_command}}
= $accent_command;
}
my %eat_space_elements;
foreach my $element ('texinfo', 'filename') {
$eat_space_elements{$element} = 1;
}
my $infile = shift @ARGV;
if (!defined($infile) or $infile !~ /\S/) {
die "Missing file\n";
}
my $reader_options = {'location' => $infile,
'expand_entities' => 0,
'pedantic_parser' => $debug,
#'validation' => 1,
};
my $reader = XML::LibXML::Reader->new($reader_options)
or die "cannot read $infile\n";
#(my $mydir = $0) =~ s,/[^/]*$,,; # dir we are in
#my $txi_dtd_libdir = "$mydir"; # find tp relative to $0
sub skip_until_end($$)
{
my $reader = shift;
my $name = shift;
while ($reader->read) {
if ($reader->nodeType() eq XML_READER_TYPE_END_ELEMENT
and $reader->name eq $name) {
return;
}
}
}
sub unprotect_spaces($)
{
my $spaces = shift;
if (defined($spaces)) {
$spaces =~ s/\\n/\n/g;
# convert back formfeed and other special characters
$spaces =~ s/\\f/\f/g;
$spaces =~ s/\\v/\x{000B}/g;
return $spaces;
} else {
return '';
}
}
my $eat_space = 0;
my $skip_comment = 0;
my @commands_with_args_stack;
while ($reader->read) {
# ============================================================ begin debug
if ($debug) {
my $args_stack_str = join('|', map {'['.$_->[0].','.$_->[1].']'}
@commands_with_args_stack);
printf STDERR "(args: $args_stack_str) (eat_space $eat_space) (skip_comment $skip_comment) %d %d %s %d", (
$reader->depth, $reader->nodeType, $reader->name, $reader->isEmptyElement);
my $value = '';
if ($reader->hasValue()) {
$value = $reader->value();
$value =~ s/\n/\\n/g;
print STDERR " |$value|";
}
if ($reader->nodeType() eq XML_READER_TYPE_ELEMENT
and $reader->hasAttributes()
and defined($reader->getAttribute('spaces'))) {
my $spaces = $reader->getAttribute('spaces');
print STDERR " spaces:$spaces|";
}
print STDERR "\n";
}
# ============================================================ end debug
if ($reader->nodeType() eq XML_READER_TYPE_SIGNIFICANT_WHITESPACE
and $eat_space) {
$eat_space = 0;
next;
} elsif ($reader->nodeType() eq XML_READER_TYPE_TEXT
or $reader->nodeType() eq XML_READER_TYPE_WHITESPACE
or $reader->nodeType() eq XML_READER_TYPE_SIGNIFICANT_WHITESPACE
) {
if ($reader->hasValue()) {
print $reader->value();
}
}
my $name = $reader->name;
if ($reader->nodeType() eq XML_READER_TYPE_ELEMENT) {
my $user_defined_index_command;
if (($name eq 'entry' or $name eq 'indexcommand')
and $reader->hasAttributes()
and defined($reader->getAttribute('command'))) {
$name = $reader->getAttribute('command');
$user_defined_index_command = 1;
} elsif ($name eq 'listitem') {
$name = 'item';
}
if ($Texinfo::Convert::TexinfoMarkup::commands_args_elements{$name}) {
push @commands_with_args_stack, [$name, 0];
}
my $spaces = unprotect_spaces($reader->getAttribute('spaces'));
my $spaces_after_command
= unprotect_spaces($reader->getAttribute('spacesaftercmd'));
if ($name eq 'accent') {
if ($reader->hasAttributes()) {
if (defined($reader->getAttribute('type'))) {
my $command = $accent_type_command{$reader->getAttribute('type')};
print "\@${command}${spaces_after_command}"
if (defined($command));
}
if (!(defined($reader->getAttribute('bracketed'))
and $reader->getAttribute('bracketed') eq 'off')) {
print '{';
}
} else {
print '{';
}
} elsif (exists $element_at_commands{$name}) {
if (!ref($element_at_commands{$name})) {
print $element_at_commands{$name};
} else {
my ($attribute) = keys(%{$element_at_commands{$name}});
if ($reader->hasAttributes()
and defined($reader->getAttribute($attribute))) {
print
$element_at_commands{$name}->{$attribute}->{
$reader->getAttribute($attribute)};
}
}
} elsif (exists($Texinfo::Commands::brace_commands{$name})) {
print "\@${name}${spaces_after_command}".'{';
if ($name eq 'verb' and $reader->hasAttributes()
and defined($reader->getAttribute('delimiter'))) {
print $reader->getAttribute('delimiter');
}
print "$spaces";
} elsif (exists($Texinfo::Commands::block_commands{$name})) {
print "\@$name";
if ($name eq 'macro' or $name eq 'rmacro') {
if ($reader->hasAttributes() and defined($reader->getAttribute('line'))) {
print $reader->getAttribute('line');
}
} else {
# leading spaces are already in the line attribute for (r)macro
print "$spaces";
}
} elsif (defined($Texinfo::Commands::line_commands{$name})
or defined($Texinfo::Commands::nobrace_commands{$name})
or $user_defined_index_command) {
if ($reader->hasAttributes()
and defined($reader->getAttribute('originalcommand'))) {
$name = $reader->getAttribute('originalcommand');
}
if ($name eq 'documentencoding' and $reader->hasAttributes()
and defined($reader->getAttribute('encoding'))) {
my $perl_encoding
= Encode::resolve_alias($reader->getAttribute('encoding'));
if (defined($perl_encoding)) {
if ($debug) {
print STDERR "Using encoding $perl_encoding\n";
}
binmode(STDOUT, ":encoding($perl_encoding)");
}
}
print "\@$name$spaces";
if ($reader->hasAttributes() and defined($reader->getAttribute('line'))) {
my $line = $reader->getAttribute('line');
$line =~ s/\\\\/\x{1F}/g;
# convert back formfeed
$line =~ s/\\f/\f/g;
$line =~ s/\x{1F}/\\/g;
# FIXME needed?
#$line =~ s/\\v/\x{000B}/g;
print $line;
}
my $specific_line = (defined($Texinfo::Commands::line_commands{$name})
and $Texinfo::Commands::line_commands{$name} eq 'specific');
if ($name eq 'set' or $name eq 'clickstyle' or $name eq 'columnfractions'
or $specific_line) {
skip_until_end($reader, $name);
if ($name eq 'columnfractions' or $specific_line) {
# specific line commands have a line argument obtained by converting
# their line to Texinfo, which would include a comment on the line,
# and could also have a comment associated to the command appearing
# after the command as an XML comment. Similar for columnfraction.
#
# We skip the possibly existing redundant XML comment following the
# closing element.
#
# start at 2 as there is a -1 right down at the end of
# the loop, and another -1 for the next element (possibly
# an ignored comment).
$skip_comment = 2;
}
}
} elsif ($arg_elements{$name}) {
# elements corresponding to @-commands arguments
if ($reader->hasAttributes()
and defined($reader->getAttribute('automatic'))
and $reader->getAttribute('automatic') eq 'on') {
skip_until_end($reader, $name);
next;
}
my ($command, $current_index) = @{$commands_with_args_stack[-1]};
my $arg_element_index = $arg_elements{$name}->{$command};
if ($commands_with_args_stack[-1]->[1] < $arg_element_index) {
while ($commands_with_args_stack[-1]->[1] < $arg_element_index) {
$commands_with_args_stack[-1]->[1]++;
print ',';
}
} elsif ($commands_with_args_stack[-1]->[1] > 0) {
# the index is already at or above the argument index. Either it is
# a variadic command, or an incorrect input.
if ($variadic_elements{$name}
and defined($variadic_elements{$name}->{$command})) {
$commands_with_args_stack[-1]->[1]++;
print ',';
# a debug consistency check
my $variadic_arg_index = $variadic_elements{$name}->{$command};
if ($commands_with_args_stack[-1]->[1] < $variadic_arg_index) {
print STDERR "BUG: $command: $name: current index < variadic_arg_index: "
."$commands_with_args_stack[-1]->[1] < $variadic_arg_index\n"
if ($debug);
}
} else {
# could happen with duplicate markup argument elements and
# markup argument elements in the wrong order
print STDERR "BAD INPUT: $command: $name(not variadic): "
."current index >= arg_element_index: "
."$commands_with_args_stack[-1]->[1] >= $arg_element_index\n"
if ($debug);
}
}
print "$spaces";
} elsif ($ignored_elements{$name}) {
my $keep_indexterm = 0;
if ($name eq 'indexterm') {
my $node_path = $reader->nodePath();
if ($node_path =~ m:([a-z]+)/indexterm$:) {
my $parent = $1;
if ($parent =~ /^[a-z]?[a-z]index$/ or $parent eq 'indexcommand') {
$keep_indexterm = 1;
}
}
}
if (!$keep_indexterm) {
#print STDERR "IGNORE $name\n" if ($debug);
skip_until_end($reader, $name);
next;
}
} elsif ($name eq 'formattingcommand') {
if ($reader->hasAttributes()) {
if (defined($reader->getAttribute('command'))
and (not (defined($reader->getAttribute('automatic'))
and $reader->getAttribute('automatic') eq 'on'))) {
print '@'.$reader->getAttribute('command');
}
}
} elsif ($name eq 'infoenclose') {
if ($reader->hasAttributes()
and defined($reader->getAttribute('command'))) {
my $command = $reader->getAttribute('command');
print "\@${command}${spaces_after_command}".'{'."$spaces";
}
# def* automatic
} elsif ($reader->hasAttributes()
and defined($reader->getAttribute('automatic'))
and $reader->getAttribute('automatic') eq 'on') {
skip_until_end($reader, $name);
# eat the following space
$reader->read();
} elsif ($eat_space_elements{$name}) {
$eat_space = 1;
} else {
print STDERR "UNKNOWN $name\n" if ($debug);
}
if ($reader->hasAttributes()) {
if (defined($reader->getAttribute('bracketed'))
and $reader->getAttribute('bracketed') eq 'on') {
print '{'."$spaces";
}
# menus 'star' and following spaces
if (defined($reader->getAttribute('leadingtext'))) {
print $reader->getAttribute('leadingtext');
}
}
if ($Texinfo::Commands::block_commands{$name}
and $Texinfo::Commands::block_commands{$name} eq 'item_line'
and $reader->hasAttributes()
and defined($reader->getAttribute('commandarg'))) {
# happens when formatting command argument is missing and there
# are no spaces.
print ' ' if ($spaces eq '');
print '@'.$reader->getAttribute('commandarg');
}
} elsif ($reader->nodeType() eq XML_READER_TYPE_END_ELEMENT) {
if ($Texinfo::Convert::TexinfoMarkup::commands_args_elements{$name}) {
pop @commands_with_args_stack;
}
my $trailingspaces = '';
if ($reader->hasAttributes()
and defined($reader->getAttribute('trailingspaces'))) {
$trailingspaces
= unprotect_spaces($reader->getAttribute('trailingspaces'));
}
if ($reader->hasAttributes()) {
if (defined($reader->getAttribute('bracketed'))
and $reader->getAttribute('bracketed') eq 'on') {
print "$trailingspaces";
# such that spaces are not prepended below when prepended
# for elements without bracketed attribute below
$trailingspaces = '';
print '}';
}
}
if (exists ($Texinfo::Commands::brace_commands{$name})) {
if ($name eq 'verb' and $reader->hasAttributes()
and defined($reader->getAttribute('delimiter'))) {
print $reader->getAttribute('delimiter');
}
print '}';
} elsif (exists($Texinfo::Commands::block_commands{$name})) {
my $end_spaces;
if ($reader->hasAttributes()
and defined($reader->getAttribute('endspaces'))) {
$end_spaces = $reader->getAttribute('endspaces');
}
$end_spaces = ' ' if (!defined($end_spaces) or $end_spaces eq '');
print "\@end".$end_spaces."$name";
} elsif (defined($Texinfo::Commands::line_commands{$name})
or defined($Texinfo::Commands::nobrace_commands{$name})) {
if ($Texinfo::Commands::root_commands{$name} and $name ne 'node') {
$eat_space = 1;
}
print "$trailingspaces";
} elsif ($elements_end_attributes{$name}) {
if ($name eq 'accent') {
if ($reader->hasAttributes()) {
if (!(defined($reader->getAttribute('bracketed'))
and $reader->getAttribute('bracketed') eq 'off')) {
print '}';
}
} else {
print '}';
}
} elsif ($reader->hasAttributes()
and defined($reader->getAttribute('separator'))) {
print $reader->getAttribute('separator');
print "$trailingspaces";
}
} elsif ($name eq 'infoenclose') {
print "$trailingspaces".'}';
} elsif ($eat_space_elements{$name}) {
$eat_space = 1;
} else {
print STDERR "END UNKNOWN $name\n" if ($debug);
print "$trailingspaces";
}
} elsif ($reader->nodeType() eq XML_READER_TYPE_ENTITY_REFERENCE) {
# for some reason XML_READER_TYPE_ENTITY is never emitted
# or $reader->nodeType() eq XML_READER_TYPE_ENTITY) {
if (defined($entity_texts{$name})) {
print $entity_texts{$name};
}
} elsif ($reader->nodeType() eq XML_READER_TYPE_COMMENT) {
my $comment;
if ($reader->hasValue()) {
$comment = $reader->value();
$comment =~ s/^ (comment|c)//;
my $command = $1;
$comment =~ s/ $//;
print "\@${command}$comment" unless ($skip_comment);
}
} elsif ($reader->nodeType() eq XML_READER_TYPE_DOCUMENT_TYPE) {
$eat_space = 1;
}
$skip_comment-- if ($skip_comment);
}
1;