Newer
Older
explore-corpus / Niveau-3 / TXM / Extraction_Corps_TEI.pl
@Pan Pan Hu Pan Pan Hu on 26 Jul 2017 1 KB fourth commit
#!/usr/bin/perl

use strict;
use warnings;
use utf8;
use open qw/:std :utf8/;
use Encode;
use Getopt::Long;
use XML::Twig;

# Base name of the running script: $0 with any leading directory part
# (everything up to the last "/") removed, then decoded from UTF-8 bytes.
my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
$programme = decode_utf8($programme);

# Command-line options; both are mandatory and are filled in by GetOptions.
my $input  = "";	# --input  : directory that is scanned for the source files
my $output = "";	# --output : directory that receives the generated files

{
	# Getopt::Long reports invalid options through warn(); turn any such
	# warning into the usage message with exit status 1.  "local" puts the
	# previous handler back automatically when this block is left, so no
	# manual restore is needed afterwards.
	local $SIG{__WARN__} = sub { usage(1); };
	GetOptions(
		"input=s"  => \$input,
		"output=s" => \$output,
	) or usage(1);
}

# Compare against the empty default instead of testing truthiness, so that a
# directory literally named "0" is accepted as a valid argument.
usage(2) if $input eq "" or $output eq "";

# Collect, in sorted order, the regular files of $input that carry a ".tei"
# extension.
opendir(my $dh, $input) or die "Couldn't open directory $input, $!";

# The pattern demands a literal ".tei" at the very end of the name, so names
# that merely end in the letters "tei" (e.g. "footei") are not picked up.
# readdir returns bare names, hence the "$input/$_" prefix for the -f test.
my @files = sort grep { /\.tei\z/ and -f "$input/$_" } readdir($dh);
closedir($dh);

# Report how many files were selected.
my $size = @files;
print STDERR "Total : $size\n";

# The entries of @files are bare file names, so every path used below has to
# be built as "<directory>/<name>".
foreach my $file (@files)
{
	# Output name: swap the trailing ".tei" for ".txt".  The substitution is
	# anchored at the end of the name; if a name carries no ".tei" suffix,
	# ".txt" is appended instead so the output never reuses the input name.
	my $new_nom = $file;
	$new_nom =~ s/\.tei\z/.txt/ or $new_nom .= ".txt";
	print STDERR "$file => $new_nom\n";

	# Lexical handle: it is captured by the handler closure below and cannot
	# collide with any other global bareword handle.
	open(my $txt, ">:encoding(UTF-8)", "$output/$new_nom") or
	die "Couldn't open file $new_nom, $!";

	# Use XML::Twig to write the text held in each <p> element found at
	# text/body/div/p, one paragraph per output line.
	my $parser = XML::Twig->new
	(
		twig_handlers =>
			{
				'text/body/div/p' => sub {
					# Handlers receive the twig and the matched element.
					my (undef, $paragraph) = @_;
					# NOTE(review): first_child_text yields the text of the
					# first child only; a <p> with inline markup may come out
					# truncated -- confirm this is intended.
					print {$txt} $paragraph->first_child_text . "\n";
				}
			}
	);
	$parser->parsefile("$input/$file");

	# Buffered write errors only surface at close time, so check it.
	close($txt) or die "Couldn't close file $new_nom, $!";
}

exit 0;

# usage($code)
# Writes a one-line usage summary to STDERR and terminates the script.
#   $code - value handed to exit() as the process exit status.
# Never returns.
sub usage
{
	my ($exit_status) = @_;

	print {*STDERR} "Usage: $programme -i input -o output\n";

	exit $exit_status;
}