#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use open qw/:std :utf8/;
use XML::Twig;
use Encode;
use Getopt::Long;
# Name of this script without its leading directory; shown in the usage line.
my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
$programme = decode_utf8($programme);

# Directories supplied on the command line (--input / --output).
my $dir_input  = "";
my $dir_output = "";

# Parse the command line. Getopt::Long reports bad options through warn();
# while parsing, any such warning is turned into the usage message (exit
# status 1). `local` restores the default warning behaviour as soon as the
# block is left, so later warnings are printed normally.
{
    local $SIG{__WARN__} = sub { usage(1); };
    GetOptions(
        "input=s"  => \$dir_input,
        "output=s" => \$dir_output,
    ) or usage(1);
}

# Both directories are mandatory. Compare against the empty default rather
# than testing truthiness, so a directory literally named "0" is accepted.
usage(2) if $dir_input eq "" or $dir_output eq "";

# Collect the plain files in the input directory whose name ends in ".tei".
opendir(my $dh, $dir_input)
    or die "Couldn't open directory $dir_input, $!";
my @files = sort grep { /\.tei\z/ and -f "$dir_input/$_" } readdir($dh);
closedir($dh);

my $size = @files;
print STDERR "Total : $size\n";

# readdir() returns bare names, so every access to a file has to go through
# "$dir/$file".
foreach my $file (@files) {
    # Same base name, with the trailing ".tei" replaced by ".txt".
    my $new_nom = $file;
    $new_nom =~ s/\.tei\z/.txt/;
    print STDERR "$file => $new_nom\n";

    open(my $txt, ">:encoding(UTF-8)", "$dir_output/$new_nom")
        or die "Couldn't open file $new_nom, $!";

    # Use XML::Twig to pull the text out of every <p> element found under
    # text/body/div and write it to the output file, one line per <p>.
    # NOTE(review): first_child_text returns only the text of the first
    # child of <p>; if paragraphs can contain inline markup, the text after
    # that first child is not written -- confirm whether ->text was meant.
    my $parser = XML::Twig->new(
        twig_handlers => {
            'text/body/div/p' => sub {
                print {$txt} $_->first_child_text . "\n";
            },
        },
    );
    $parser->parsefile("$dir_input/$file");

    # Buffered write errors only surface when the handle is closed.
    close($txt) or die "Couldn't close file $new_nom, $!";
}

exit 0;
# Print the command-line synopsis on STDERR and terminate the script.
#   first argument: exit status passed on to exit().
# Does not return.
sub usage {
    my ($status) = @_;

    print STDERR "Usage: $programme -i input -o output\n";
    exit $status;
}