# ParagraphNonXS.pm: handle paragraph text. # # Copyright 2010-2023 Free Software Foundation, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, # or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # # Original author: Patrice Dumas # this module has nothing Texinfo specific. In contrast with existing # modules Text::Wrap, Text::Format, it keeps a state of the paragraph # and waits for text to be fed into it. package Texinfo::Convert::Paragraph; use 5.006; use strict; use if $] >= 5.014, re => '/a'; # ASCII-only character classes in regexes # To check if there is no erroneous autovivification #no autovivification qw(fetch delete exists store strict); use Unicode::EastAsianWidth; use Texinfo::Convert::Unicode; use Carp qw(cluck); # initialize a paragraph object. sub new($;$) { my $class = shift; my $conf = shift; my $self = {'max' => 72, 'indent_length' => 0, 'counter' => 0, 'word_counter' => 0, 'space' => '', 'frenchspacing' => 0, 'lines_counter' => 0, 'end_line_count' => 0, 'unfilled' => 0, 'last_letter' => '' }; if (defined($conf)) { foreach my $key (keys(%$conf)) { $self->{$key} = $conf->{$key}; } } bless $self, $class; } # for debugging sub dump($) { my $self = shift; print STDERR "para ($self->{'counter'}+$self->{'word_counter'}) " ."word: ".(defined($self->{'word'}) ? $self->{'word'} : 'UNDEF') .", space `$self->{'space'}' " ."end_sentence: ".(defined($self->{'end_sentence'}) ? $self->{'end_sentence'} : 'UNDEF')."\n"; } sub _cut_line($) { my $paragraph = shift; return '' if ($paragraph->{'ignore_columns'}); return _end_line($paragraph); } sub end_line_count($) { my $paragraph = shift; return $paragraph->{'end_line_count'}; } sub counter($) { my $paragraph = shift; return $paragraph->{'counter'}; } sub end_line($) { my $paragraph = shift; $paragraph->{'end_line_count'} = 0; return _end_line($paragraph); } # end a line. sub _end_line($) { my $paragraph = shift; $paragraph->{'counter'} = 0; $paragraph->{'space'} = ''; if (defined($paragraph->{'indent_length_next'})) { $paragraph->{'indent_length'} = $paragraph->{'indent_length_next'}; delete $paragraph->{'indent_length_next'}; } $paragraph->{'lines_counter'}++; $paragraph->{'end_line_count'}++; # could be set to other values, anything that is not upper case. $paragraph->{'last_letter'} = "\n"; print STDERR "END_LINE\n" if ($paragraph->{'DEBUG'}); return "\n"; } sub get_pending($) { my $paragraph = shift; my $result = ''; if ($paragraph->{'space'}) { $result .= $paragraph->{'space'}; } if (defined($paragraph->{'word'})) { $result .= $paragraph->{'word'}; } return $result; } sub add_pending_word($;$) { my $paragraph = shift; my $add_spaces = shift; $paragraph->{'end_line_count'} = 0; return _add_pending_word($paragraph, $add_spaces); } # put a pending word and spaces in the result string. sub _add_pending_word($;$) { my $paragraph = shift; my $add_spaces = shift; my $result = ''; if (not defined($paragraph->{'word'}) and not $add_spaces) { return $result; } if ($paragraph->{'indent_length'} > $paragraph->{'counter'}) { $result .= ' ' x ($paragraph->{'indent_length'} - $paragraph->{'counter'}); $paragraph->{'counter'} = $paragraph->{'indent_length'}; print STDERR "INDENT($paragraph->{'counter'}+$paragraph->{'word_counter'})\n" if ($paragraph->{'DEBUG'}); delete $paragraph->{'space'} unless $paragraph->{'unfilled'}; } if ($paragraph->{'space'}) { $result .= $paragraph->{'space'}; $paragraph->{'counter'} += length($paragraph->{'space'}); print STDERR "ADD_SPACES($paragraph->{'counter'}+$paragraph->{'word_counter'})\n" if ($paragraph->{'DEBUG'}); } $paragraph->{'space'} = ''; if (defined($paragraph->{'word'})) { $result .= $paragraph->{'word'}; $paragraph->{'counter'} += $paragraph->{'word_counter'}; print STDERR "ADD_WORD[$paragraph->{'word'}]+$paragraph->{'word_counter'}" ." ($paragraph->{'counter'})\n" if ($paragraph->{'DEBUG'}); $paragraph->{'word'} = undef; $paragraph->{'word_counter'} = 0; } return $result; } # end a paragraph sub end($) { my $paragraph = shift; $paragraph->{'end_line_count'} = 0; print STDERR "PARA END\n" if ($paragraph->{'DEBUG'}); my $result = _add_pending_word($paragraph, $paragraph->{'add_final_space'}); # probably not really useful, but cleaner $paragraph->{'last_letter'} = ''; if (!$paragraph->{'no_final_newline'} and $paragraph->{'counter'} != 0) { $result .= "\n"; $paragraph->{'lines_counter'}++; $paragraph->{'end_line_count'}++; } return $result; } my $end_sentence_characters = quotemeta('.?!'); my $after_punctuation_characters = quotemeta('"\')]'); # Add $WORD to paragraph, returning the text to be added to the paragraph. # Any end of sentence punctuation in $WORD that should be allowed to end a # sentence but which would otherwise be preceded by an upper-case letter should # instead by preceded by a backspace character. sub add_next($;$$) { my $paragraph = shift; my $word = shift; my $transparent = shift; $paragraph->{'end_line_count'} = 0; return _add_next($paragraph, $word, $transparent); } # add a word (without wrapping). sub _add_next($;$$$) { my $paragraph = shift; my $word = shift; my $transparent = shift; my $newlines_impossible = shift; my $result = ''; if (!defined($word)) { return ''; } my $disinhibit = 0; # Reverse the insertion of any control characters in Plaintext.pm. if ($word =~ /\x08$/) { $disinhibit = 1; } $word =~ s/\x08//g; $paragraph->{'word'} .= $word; if (!$transparent) { if ($disinhibit) { $paragraph->{'last_letter'} = 'a'; } elsif ($word =~ /([^$end_sentence_characters$after_punctuation_characters]) [$end_sentence_characters$after_punctuation_characters]*$/ox) { # Save the last character in $word before punctuation $paragraph->{'last_letter'} = $1; } } if (!$newlines_impossible and $word =~ /\n/) { $result .= _add_pending_word ($paragraph); _end_line($paragraph); } else { $paragraph->{'word_counter'} += Texinfo::Convert::Unicode::string_width($word); # The $paragraph->{'counter'} != 0 is here to avoid having an # additional line output when the text is longer than the max. if ($paragraph->{'counter'} != 0 and $paragraph->{'counter'} + $paragraph->{'word_counter'} + length($paragraph->{'space'}) > $paragraph->{'max'}) { $result .= _cut_line($paragraph); } } if ($paragraph->{'DEBUG'}) { print STDERR "WORD+ $word -> " .(defined($paragraph->{'word'}) ? $paragraph->{'word'} : 'UNDEF')."\n"; } return $result; } sub remove_end_sentence($) { my $paragraph = shift; $paragraph->{'end_sentence'} = 0; } sub add_end_sentence($;$) { my $paragraph = shift; my $value = shift; $paragraph->{'end_sentence'} = $value; } sub allow_end_sentence($) { my $paragraph = shift; printf STDERR "ALLOW END SENTENCE\n" if $paragraph->{'DEBUG'}; $paragraph->{'last_letter'} = 'a'; # lower-case } sub set_space_protection($$;$$$$) { my $paragraph = shift; my $no_break = shift; my $ignore_columns = shift; my $keep_end_lines = shift; my $frenchspacing = shift; my $double_width_no_break = shift; $paragraph->{'no_break'} = $no_break if defined($no_break); $paragraph->{'ignore_columns'} = $ignore_columns if defined($ignore_columns); $paragraph->{'keep_end_lines'} = $keep_end_lines if defined($keep_end_lines); $paragraph->{'frenchspacing'} = $frenchspacing if defined($frenchspacing); $paragraph->{'double_width_no_break'} = $double_width_no_break if defined($double_width_no_break); # begin a word, to have something even if empty if ($no_break) { _add_next($paragraph, ''); } } # Wrap $TEXT, returning the wrapped text, taking into account the current state # of $PARAGRAPH. Any end of sentence punctuation in $TEXT that should be # allowed to end a sentence but which would otherwise be preceded by an # upper-case letter should instead by preceded by a backspace character. sub add_text($$) { my $paragraph = shift; my $text = shift; $paragraph->{'end_line_count'} = 0; my $result = ''; my @segments = split /(\s+)|(\p{InFullwidth})|((?:[^\s\p{InFullwidth}])+)/, $text; # Check now if a newline exists anywhere in the string to # try to eliminate regex checks later. my $newline_possible_flag = ($text =~ /\n/); my $debug_flag = $paragraph->{'DEBUG'}; while (@segments) { # $empty_segment should be an empty string; the other variables # here were recognized as field separators by split, the separator # set to something else than undef for the separator matching. my ($empty_segment, $spaces, $fullwidth_segment, $added_word) = splice (@segments, 0, 4); if ($debug_flag) { print STDERR "p ($paragraph->{'counter'}+$paragraph->{'word_counter'}) " ."s `" . _print_escaped_spaces($paragraph->{'space'})."', " ."l `$paragraph->{'last_letter'}', " ."w `".(defined($paragraph->{'word'}) ? $paragraph->{'word'} : 'UNDEF')."'\n"; } if (defined $spaces) { print STDERR "SPACES($paragraph->{'counter'}) `" ._print_escaped_spaces($spaces)."'\n" if $debug_flag; if ($paragraph->{'unfilled'}) { $result .= _add_pending_word($paragraph); if ($spaces =~ /\n/) { $result .= _end_line ($paragraph); } else { $paragraph->{'space'} .= $spaces; } } else { my $at_end_sentence = 0; $at_end_sentence = 1 if ($paragraph->{'end_sentence'} and $paragraph->{'end_sentence'} > 0 and !$paragraph->{'frenchspacing'}); if ($paragraph->{'no_break'}) { if (substr($paragraph->{'word'}, -1) ne ' ') { my $new_spaces = $at_end_sentence ? ' ' : ' '; $paragraph->{'word'} .= $new_spaces; $paragraph->{'word_counter'} += length($new_spaces); # The $paragraph->{'counter'} != 0 is here to avoid having an # additional line output when the text is longer than the max. if ($paragraph->{'counter'} != 0 and $paragraph->{'counter'} + $paragraph->{'word_counter'} + length($paragraph->{'space'}) > $paragraph->{'max'}) { $result .= _cut_line($paragraph); } } } else { my $pending_word = $paragraph->{'word'}; $result .= _add_pending_word($paragraph); if ($paragraph->{'counter'} != 0 or (defined $pending_word)) { if ($at_end_sentence) { $paragraph->{'space'} = ' '; } else { # Only save the first space if (length($paragraph->{'space'}) < 1) { if ($spaces =~ /\n/) { $paragraph->{'space'} = ' '; } else { $paragraph->{'space'} .= substr ($spaces, 0, 1); } } } } } } if ($paragraph->{'counter'} + length($paragraph->{'space'}) > $paragraph->{'max'}) { $result .= _cut_line($paragraph); } if ($newline_possible_flag and !$paragraph->{'unfilled'} and $paragraph->{'keep_end_lines'} and $spaces =~ /\n/) { $result .= _end_line($paragraph); } $paragraph->{'last_letter'} = ' '; } elsif (defined $added_word) { my $tmp = $added_word; # Prepend 'last_letter' to add the information on the last # letter even if it was read as part of a previous string # Add it here because _add_next overwrites it. Note that # if _add_next overwrited it, it wouldn't lead to an invalid # result, as the wrong prepended 'last_letter' would not match # at the end of the $added_word in the regex below anyway. $tmp = $paragraph->{'last_letter'} . $tmp; $result .= _add_next($paragraph, $added_word, undef, !$newline_possible_flag); # Check if it is considered as an end of sentence. There are two things # to check: one, that we have a ., ! or ?; and second, that it is not # preceded by an upper-case letter (ignoring some punctuation) if (defined($paragraph->{'end_sentence'}) and $added_word =~ /^[$after_punctuation_characters]*$/o) { # do nothing in the case of a continuation of after_punctuation_characters } elsif (!$paragraph->{'unfilled'} and $tmp =~ /(^|[^\p{Upper}$after_punctuation_characters$end_sentence_characters]) [$after_punctuation_characters]*[$end_sentence_characters] [$end_sentence_characters\x08$after_punctuation_characters]*$/ox) { if ($paragraph->{'frenchspacing'}) { $paragraph->{'end_sentence'} = -1; } else { $paragraph->{'end_sentence'} = 1; } print STDERR "END_SENTENCE\n" if ($paragraph->{'DEBUG'}); } else { print STDERR "delete END_SENTENCE($paragraph->{'end_sentence'})\n" if (defined($paragraph->{'end_sentence'}) and $paragraph->{'DEBUG'}); delete $paragraph->{'end_sentence'}; } } elsif (defined $fullwidth_segment) { print STDERR "FULLWIDTH\n" if ($paragraph->{'DEBUG'}); if (!defined($paragraph->{'word'})) { $paragraph->{'word'} = ''; } $paragraph->{'word'} .= $fullwidth_segment; $paragraph->{'word_counter'} += 2; # fullwidth latin letters can be upper case, so it is important to # use the actual characters here. $paragraph->{'last_letter'} = $fullwidth_segment; # We allow a line break in between Chinese characters even if # there was no space between them, unlike single-width # characters. if ($paragraph->{'counter'} != 0 and $paragraph->{'counter'} + $paragraph->{'word_counter'} > $paragraph->{'max'}) { $result .= _cut_line($paragraph); } if (!$paragraph->{'no_break'} and !$paragraph->{'double_width_no_break'}) { $result .= _add_pending_word($paragraph); } delete $paragraph->{'end_sentence'}; } } return $result; } # for debug sub _print_escaped_spaces($) { my $spaces = shift; my $result = ''; foreach my $pos (0 .. length($spaces)-1) { my $char = substr($spaces, $pos, 1); if ($char eq ' ') { $result .= $char; } elsif ($char =~ /[\f\n]/) { $char =~ s/\f/\\f/; $char =~ s/\n/\\n/; $result .= $char; } elsif ($char =~ /\s/) { if (ord($char) <= hex(0xFFFF)) { $result .= '\x'.sprintf("%04x",ord($char)); } else { $result .= '\x'.sprintf("%06x",ord($char)); } } else { $result .= $char; } } return $result; } 1;