use strict; use warnings; my $VERSION; $VERSION = '1.01'; # This script converts Praat (http://www.fon.hum.uva.nl/praat/) annotation # format (.TextGrid) file to HTK (http://htk.eng.cam.ac.uk/) annotation format # (.lab) file. # See the POD documentation at the end of this file # or run `perl tg2lab.pl --man' # for more information. use Getopt::Long; use Pod::Usage; my %opt = ( help => 0, man => 0, i => undef, script => undef, ext => 'lab', verbose => 0, version => 0, ); GetOptions(\%opt, 'help|?', 'man', 'i=s', 'script|S=s', 'ext|x=s', 'verbose|v', 'version', ) or pod2usage(1); pod2usage(1) if $opt{help}; pod2usage(-exitstatus => 0, -verbose => 2) if $opt{man}; print "$VERSION\n" and exit(0) if $opt{version}; if (@ARGV == 0 && !$opt{script}) { pod2usage(1); exit(0); } my $tgFileName = ''; my $labFileName = ''; my $mlfFileName = $opt{i}; my $mlfFile; if ($mlfFileName) { if (!open($mlfFile, ">$mlfFileName")) { my $msg = "ERROR: Cannot create file $mlfFileName: $!.\n"; die $msg; } print $mlfFile "#!MLF!#\n"; close($mlfFile); } if (@ARGV > 0) { $tgFileName = $ARGV[0]; if (@ARGV > 1) { $labFileName = $ARGV[1]; if (@ARGV > 2) { warn "WARNING: Too many arguments is given to the script.\n"; } } else { $labFileName = ChangeFileExt($tgFileName, $opt{ext}); } print "\nConverting $tgFileName -> $labFileName\n"; tg2lab($tgFileName, $labFileName, $mlfFileName); print "Done.\n"; } if ($opt{script}) { my $scriptFileName = $opt{script}; if (-e $scriptFileName) { print "\nProcessing $scriptFileName. Please wait...\n"; my $scriptFile; if (!open($scriptFile, "<$scriptFileName")) { my $msg = "ERROR: Can't open file $scriptFileName for reading: $!.\n"; die $msg; } my $lineNo = 0; my $numFiles = 0; while (<$scriptFile>) { chomp; $lineNo++; if ($_ !~ /^\s*$/) { # if current line is not empty, then ... my @fields = (); @fields = ($_ =~ /^\s*("[^"]+"|\S+)\s*("[^"]+"|\S+)?s*$/); my $tgFileName = ''; my $labFileName = ''; if ($fields[0]) { $tgFileName = $fields[0]; $tgFileName =~ s/"//g; if ($fields[1]) { $labFileName = $fields[1]; $labFileName =~ s/"//g; } else { $labFileName = ChangeFileExt($tgFileName, $opt{ext}); } } else { my $msg = "Bad format of file $scriptFileName at line $lineNo.\n"; die $msg; } $numFiles++; if ($opt{verbose}) { print "$numFiles: Converting $tgFileName -> $labFileName\n"; } tg2lab($tgFileName, $labFileName, $mlfFileName); } } close($scriptFile); print "Done. $numFiles files processed.\n"; } else { my $msg = "ERROR: Can't find file $scriptFileName.\n"; die $msg; } } sub tg2lab { my $tgFileName = shift; my $labFileName = shift; my $mlfFileName = shift; # Reading TextGrid file my $tgFile; if (!open($tgFile, "<$tgFileName")) { my $msg = "ERROR: Can't open file $tgFileName for reading: $!.\n"; die $msg; } my @lines = <$tgFile>; close($tgFile); # Analyzing TextGrid file my $l = 0; # index of current line chomp(@lines); if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*File\s+type\s*=\s*"ooTextFile"\s*$/) { my $msg = "ERROR: Bad header of file $tgFileName: 'File type = \"ooTextFile\"' not found.\n"; die $msg; } if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*Object\s+class\s*=\s*"TextGrid"\s*$/) { my $msg = "ERROR: Bad header of file $tgFileName: 'Object class = \"TextGrid\"' not found.\n"; die $msg; } if (not exists($lines[$l])) { my $msg = "ERROR: File $tgFileName is corrupt.\n"; die $msg; } if ($lines[$l] =~ /^\s*$/) { $l++; } my $xmin_pattern = '^\s*xmin\s*=\s*([\d\.,eE\+-]+)\s*$'; my $xmax_pattern = '^\s*xmax\s*=\s*([\d\.,eE\+-]+)\s*$'; if (not exists($lines[$l]) or $lines[$l++] !~ /$xmin_pattern/) { my $msg = "ERROR: Cannot read global xmin value.\n"; die $msg; } if (not exists($lines[$l]) or $lines[$l++] !~ /$xmax_pattern/) { my $msg = "ERROR: Cannot read global xmax value.\n"; die $msg; } if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*tiers\?\s*\s*$/) { my $msg = "ERROR: 'tiers? ' not found.\n"; die $msg; } my $items_size; if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*size\s*=\s*(\d+)\s*$/) { my $msg = "ERROR: Cannot read tiers size value.\n"; die $msg; } $items_size = $1; if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*item\s*\[\]:\s*$/) { my $msg = "ERROR: 'item []: ' not found.\n"; die $msg; } my @items = (); my $i = 1; # index of current item while ($i <= $items_size) { if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*item\s*\[$i\]:\s*$/) { my $msg = "ERROR: 'item [$i]:' not found.\n"; die $msg; } if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*class\s*=\s*"IntervalTier"\s*$/) { my $msg = "ERROR: 'class = \"IntervalTier\"' of 'item [$i]' not found.\n"; die $msg; } if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*name\s*=\s*".+"\s*$/) { my $msg = "ERROR: Cannot read 'name' of 'item [$i]'.\n"; die $msg; } if (not exists($lines[$l]) or $lines[$l++] !~ /$xmin_pattern/) { my $msg = "ERROR: Cannot read xmin value of 'item [$i]'.\n"; die $msg; } if (not exists($lines[$l]) or $lines[$l++] !~ /$xmax_pattern/) { my $msg = "ERROR: Cannot read xmax value of 'item [$i]'.\n"; die $msg; } my $intervals_size = 0; if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*intervals:\s*size\s*=\s*(\d+)\s*$/) { my $msg = "ERROR: Cannot read 'intervals: size' of 'item [$i]'.\n"; die $msg; } $intervals_size = $1; my @intervals = (); my $j = 1; # index of current interval in item while ($j <= $intervals_size) { if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*intervals\s*\[$j\]:\s*$/) { my $msg = "ERROR: 'intervals [$j]:' of 'item [$i]:' not found.\n"; die $msg; } my $xmin = 0; my $xmax = 0; my $text = 0; if (not exists($lines[$l]) or $lines[$l++] !~ /$xmin_pattern/) { my $msg = "ERROR: Cannot read xmin value of 'intervals [$j]:' of 'item [$i]'.\n"; die $msg; } $xmin = $1; if (not exists($lines[$l]) or $lines[$l++] !~ /$xmax_pattern/) { my $msg = "ERROR: Cannot read xmax value of 'intervals [$j]:' of 'item [$i]'.\n"; die $msg; } $xmax = $1; if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*text\s*=\s*"(.+)"\s*$/) { my $msg = "ERROR: Cannot read xmax value of 'intervals [$j]:' of 'item [$i]'.\n"; die $msg; } $text = $1; $text =~ s/""/"/g; my %interval = ( xmin => $xmin, xmax => $xmax, text => $text, ); push @intervals, { %interval }; $j++; } push @items, [ @intervals ]; $i++; } # sort items by intervals size if ($#items > 0) { for ($i = 0; $i < $#items; $i++) { for (my $j = $#items; $j > $i; $j--) { if ( $#{$items[$i]} < $#{$items[$j]} ) { my $tmp_item = $items[$i]; $items[$i] = $items[$j]; $items[$j] = $tmp_item; } } } } my @k = (); # auxiliary array for ($i = 0; $i <= $#items; $i++) { $k[$i] = 0; } my ($labFile, $mlfFile); if (!open($labFile, ">$labFileName")) { my $msg = "ERROR: Can't create file $labFileName: $!.\n"; die $msg; } if ($mlfFileName) { if (!open($mlfFile, ">>$mlfFileName")) { my $msg = "ERROR: Cannot open file $mlfFileName for appending: $!.\n"; die $msg; } print $mlfFile "\"$labFileName\"\n"; } my ($diff, $next_diff) = 0; for (my $j = 0; $j <= $#{$items[0]}; $j++) { my $xmin = sprintf "%0.0f", $items[0][$j]{xmin} * 1.0e7; my $xmax = sprintf "%0.0f", $items[0][$j]{xmax} * 1.0e7; my $t = "$xmin $xmax $items[0][$j]{text}"; print $labFile $t; if ($mlfFileName) { print $mlfFile $t; } for ($i = 1; $i <= $#items; $i++) { if ($k[$i] <= $#{$items[$i]}) { $diff = abs($items[$i][$k[$i]]{xmin} - $items[0][$j]{xmin}); if ($j+1 <= $#{$items[0]}) { $next_diff = abs($items[$i][$k[$i]]{xmin} - $items[0][$j+1]{xmin}); } if ($diff <= $next_diff) { $t = " $items[$i][$k[$i]]{text}"; print $labFile $t; if ($mlfFileName) { print $mlfFile $t; } $k[$i]++; } } } print $labFile "\n"; if ($mlfFileName) { print $mlfFile "\n"; } } if ($mlfFileName) { print $mlfFile ".\n"; } close($labFile); if ($mlfFileName) { close($mlfFile); } } sub ChangeFileExt { my $fileName = shift; my $ext = shift; if ($fileName =~ s/\.[-\w\ ]*$/\.$ext/) { } else { $fileName = $fileName.'.'.$ext; } return $fileName; } __END__ =head1 NAME tg2lab.pl - convert Praat (http://www.fon.hum.uva.nl/praat/) annotation format (.TextGrid) file to HTK (http://htk.eng.cam.ac.uk/) annotation format (.lab) file. =head1 SYNOPSIS =over =item B [I] I [I] =item B [I] I<--script f> [I] [I] =back =head1 DESCRIPTION Converts Praat (http://www.fon.hum.uva.nl/praat/) annotation format (.TextGrid) file I to HTK (http://htk.eng.cam.ac.uk/) annotation format (.lab) file I. If I (output) file name is not provided, I (source) file name will be used but with different extension (C<.lab> by default). Script file I can be used for batch conversion of multiple files. In this case, a list of each source file and (optional) its corresponding output file should be provided in the script file. =head1 OPTIONS =over =item B<-i> I Output transcriptions to Master Label File (MLF) I. The default is off. =item B<-S> I, B<--script> I Set script file to I. The script file I can be used for batch conversion of multiple files. In this case, a list of each source file and (optional) its corresponding output file should be provided in the script file. The default is none. =item B<-x> I, B<--ext> I Set default TextGrid output file extension to I. The default is C<.TextGrid>. =item B<-v>, B<--verbose> Verbose output to the screen. The default is off. =item B<-?>, B<--help> Prints the B and B sections. =item B<--man> Prints the tg2lab.pl manual. =item B<--version> Prints the current version number of tg2lab.pl and exits. =back =head1 HISTORY v1.01 (20090701): Fixed SCRIPT CATEGORIES section of POD documentation. v1.00 (20090629): First public release. =head1 AUTHOR Mark Filipovic > =head1 COPYRIGHT Copyright (c) 2009 Mark Filipovic. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =begin CPAN =head1 README This script converts Praat (http://www.fon.hum.uva.nl/praat/) annotation format (.TextGrid) file to HTK (http://htk.eng.cam.ac.uk/) annotation format (.lab) file. =head1 PREREQUISITES This script requires C, C, C, and C modules. =head1 OSNAMES any =head1 SCRIPT CATEGORIES Speech/Annotation Speech/Labelling =end CPAN =cut