#!/usr/bin/perl
# Extraction_Corps_TEI.pl
#
# Reads every regular file whose name ends in ".tei" from the input
# directory, and for each one writes a ".txt" file of the same base name
# into the output directory. The text file receives one line per element
# matched by the XML::Twig path 'text/body/div/p'.
#
# Usage: Extraction_Corps_TEI.pl -i input_dir -o output_dir
# Exit codes: 0 on success, 1 on an option-parsing warning, 2 when either
# directory argument is missing.
use strict;
use warnings;
use utf8;
use open qw/:std :utf8/;
use XML::Twig;
use Encode;
use Getopt::Long;

# Script name without its leading directory part, used in the usage message.
my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
$programme = decode_utf8($programme);

my $dir_input  = "";
my $dir_output = "";

# Any warning raised while parsing options (e.g. an unknown option) is
# turned into the usage message and exit code 1. The handler is localised
# to this block so the previous warning behaviour is restored afterwards.
{
    local $SIG{__WARN__} = sub { usage(1); };
    GetOptions(
        "input=s"  => \$dir_input,
        "output=s" => \$dir_output,
    );
}

# Both directories are mandatory.
usage(2) if not $dir_input or not $dir_output;

opendir(my $dh, $dir_input) or die "Couldn't open file $dir_input, $!";

# Keep only regular files named *.tei. The directory must be prefixed for
# the -f test because readdir returns bare entry names.
my @files = sort grep { /\.tei\z/ and -f "$dir_input/$_" } readdir($dh);
closedir($dh);

my $size = @files;
print STDERR "Total : $size\n";

foreach my $file (@files) {
    # Derive the output name by swapping the trailing .tei extension only.
    (my $new_nom = $file) =~ s/\.tei\z/.txt/;
    print STDERR "$file => $new_nom\n";

    open(my $txt, ">:encoding(UTF-8)", "$dir_output/$new_nom")
        or die "Couldn't open file $new_nom, $!";

    # Use XML::Twig to pull the content of each <p> under text/body/div.
    # NOTE(review): first_child_text yields only the text of the element's
    # first child; if paragraphs can contain inline markup, ->text may be
    # what is wanted -- confirm against the corpus.
    my $parser = XML::Twig->new(
        twig_handlers => {
            'text/body/div/p' => sub {
                print {$txt} $_->first_child_text . "\n";
            },
        },
    );
    $parser->parsefile("$dir_input/$file");

    # Buffered write errors only surface at close time, so check it.
    close($txt) or die "Couldn't close file $new_nom, $!";
}

exit 0;

# usage($code): print the command-line synopsis on STDERR and terminate
# the program with exit status $code. Never returns.
sub usage {
    my $code = shift;
    print STDERR "Usage: $programme -i input -o output\n";
    exit $code;
}