Newer
Older
explore-corpus / Niveau-1 / TXM / Extraction_Corps_TEI.pl
@Pan Pan Hu Pan Pan Hu on 21 Jul 2017 1 KB second commit
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use open qw/:std :utf8/;
use XML::Twig;
use Encode;
use Getopt::Long;

# Base name of this script (any leading directory stripped), decoded from
# UTF-8 bytes so it prints correctly in the usage message.
my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
$programme = decode_utf8($programme);

# Directory that is scanned for *.tei files, and directory that receives the
# generated *.txt files. Both are mandatory; GetOptions fills them in.
my $dir_input  = "";
my $dir_output = "";

# While the command line is parsed, any warning (Getopt::Long reports bad
# options through warn) prints the usage line and exits with status 1.
# "local" restores the previous warning behaviour once the eval block is left.
eval {
	local $SIG{__WARN__} = sub { usage(1); };
	GetOptions(
		"input=s"  => \$dir_input,
		"output=s" => \$dir_output,
	);
};

# Both directories are required: exit with status 2 when one is missing.
usage(2) if not $dir_input or not $dir_output;

opendir(my $dh, $dir_input) or die "Couldn't open file $dir_input, $!";

# Keep only regular files whose name ends in ".tei". readdir returns bare
# names, so the directory must be prepended before testing with -f.
my @files = sort grep { /\.tei$/ and -f "$dir_input/$_" } readdir($dh);
closedir($dh);

my $size = @files;
print STDERR "Total : $size\n";

# Paths must always be written "$dir/$file": $file alone is only a name
# relative to the scanned directory.
foreach my $file (@files)
{
	# Output name: same base name with the trailing ".tei" replaced by ".txt".
	# The match is anchored so that only the extension is rewritten.
	my $new_nom = $file;
	$new_nom =~ s/\.tei$/.txt/;
	print STDERR "$file => $new_nom\n";

	open(my $txt, ">:encoding(UTF-8)", "$dir_output/$new_nom") or
		die "Couldn't open file $new_nom, $!";

	# Use XML::Twig and, for every <p> reached through text/body/div, write
	# the text of its first child followed by a newline.
	# NOTE(review): first_child_text only covers the first child node; if a
	# paragraph contains inline elements, the text after them is not written.
	# $_->text would return the whole paragraph - confirm which is intended.
	my $parser = XML::Twig->new(
		twig_handlers => {
			'text/body/div/p' => sub {
				print {$txt} $_->first_child_text . "\n";
			},
		},
	);
	$parser->parsefile("$dir_input/$file");

	# Buffered write errors only surface when the handle is closed.
	close($txt) or die "Couldn't close file $new_nom, $!";
}

exit 0;

# Print the one-line usage summary on STDERR and terminate the program.
#
# Argument: the exit status handed to exit().
# Never returns.
sub usage
{
	my ($exit_status) = @_;
	print {*STDERR} "Usage: $programme -i input -o output\n";
	exit $exit_status;
}