diff --git a/Niveau-2/Polaris/Extraction_metadata.pl b/Niveau-2/Polaris/Extraction_metadata.pl new file mode 100644 index 0000000..f25dda9 --- /dev/null +++ b/Niveau-2/Polaris/Extraction_metadata.pl @@ -0,0 +1,78 @@ +#!/usr/bin/perl +use strict; +use warnings; +use utf8; +use open qw/:std :utf8/; + +use Encode; +use Getopt::Long; + +my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; +$programme = decode_utf8($programme); + +my $metadata = ""; +my $sortie = ""; + +eval { + $SIG{__WARN__} = sub {usage(1);}; + GetOptions( + "metadata=s" => \$metadata, + "sortie=s" => \$sortie, + ); + }; +$SIG{__WARN__} = sub {warn $_[0];}; + +usage(2) if not $metadata or not $sortie; + +open(META, "<:utf8", $metadata) or die "Couldn't open file \"$metadata\", $!"; +open(SORTIE, "+<:utf8", $sortie) or die "Couldn't open file \"$sortie\", $!"; + +foreach my $ligne () +{ + chomp($ligne); + if ($ligne =~ /^DT : (.*)/) + { + print SORTIE "**** *DT_"; + my $dt = $1; + $dt =~ s/ ; /_/g; + print SORTIE $dt; + print SORTIE " *SO_" + #my $dt = $1; + #print SORTIE "**** *DT_" + #print SORTIE $dt; + } + elsif ($ligne =~ /^SO : (.*?) ;/) + { + #$valeurs[1] = "$1"; + #print SORTIE " *SO_" + #print SORTIE $valeurs[1]; + my $so = $1; + $so =~ s/ /_/g; + print SORTIE $so; + print SORTIE " *LA_" + } + elsif ($ligne =~ /^LA : (.*)/) + { + #$valeurs[2] = "$1"; + #print SORTIE " *LA_" + #print SORTIE $valeurs[2]; + my $la = $1; + $la =~ s/ /_/g; + print SORTIE $la; + print SORTIE "\n"; + } +} + +close META; +close SORTIE; + +exit 0; + +sub usage +{ +my $code = shift; + +print STDERR "Usage : $programme -m metadata -s sortie \n"; + +exit $code; +} diff --git a/Niveau-2/Polaris/Preparation_entree_Ira_polaris.pl b/Niveau-2/Polaris/Preparation_entree_Ira_polaris.pl new file mode 100644 index 0000000..1873102 --- /dev/null +++ b/Niveau-2/Polaris/Preparation_entree_Ira_polaris.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use utf8; +use open qw/:std :utf8/; + +use Encode; +use Getopt::Long; + +my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; +$programme = decode_utf8($programme); + +my $dir = ""; +my $metadata = ""; +my $sortie = ""; + +eval { + $SIG{__WARN__} = sub {usage(1);}; + GetOptions( + "dir=s" => \$dir, + "metadata=s" => \$metadata, + "sortie=s" => \$sortie, + ); + }; +$SIG{__WARN__} = sub {warn $_[0];}; + +usage(2) if not $dir or not $metadata or not $sortie; + +#ouverture du dossier contenant les fichier .txt + +opendir(DIR, $dir) or die "Couldn't open file $dir, $!"; +my @files = sort grep {not /^\./} readdir(DIR); +closedir(DIR); +my $size = @files; +print STDERR "Total : $size\n"; + +#ouverture des métadonnées +open(METADATA, "<:utf8", $metadata) or die "Couldn't open file $metadata, $!"; + +#ouverture d'un fichier pour saisir des données +open(DATA, ">:utf8", $sortie) or die "Couldn't open file $sortie, $!"; + +my $i = 0; +while (my $metadata = ) { + print DATA $metadata; + open(FILE, "<:utf8", "$dir/$files[$i]") or die "Couldn't open file $files[$i], $!"; + while(my $ligne = ){ + #suppression des étoiles et des chiffres + $ligne =~ s/\*//; + print DATA $ligne; + } + close(FILE); + $i++; +} + +close(METADATA); +close(DATA); + +exit 0; + + +sub usage +{ +my $code = shift; + +print STDERR "Usage : $programme -d répertoire -m métadonnées -s sortie\n\n"; + +exit $code; +} diff --git a/Niveau-2/Polaris/ReadMe b/Niveau-2/Polaris/ReadMe new file mode 100644 index 0000000..b53e939 --- /dev/null +++ b/Niveau-2/Polaris/ReadMe @@ -0,0 +1,3 @@ +Le script Extraction_metadata.pl sert à extraire certaines médadonnées (DT>type de document, SO>source, LA>langue) comme variable depuis polarisRecent_modifié.txt pour l'entrée d'Iramuteq. + +Le script Preparation_entree_Ira_polaris.pl sert à concatener les articles dans le corpus Polaris et les médadonnées extraites par le script précédent. diff --git a/Niveau-2/post_traitement_corpus/rename_file.pl b/Niveau-2/post_traitement_corpus/rename_file.pl new file mode 100644 index 0000000..e44ee02 --- /dev/null +++ b/Niveau-2/post_traitement_corpus/rename_file.pl @@ -0,0 +1,75 @@ +#!/usr/bin/perl +use strict; +use warnings; +use utf8; +use open qw/:std :utf8/; + +use Encode; +use Getopt::Long; + +my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; +$programme = decode_utf8($programme); + +my $source = ""; #par exemple "Vieil_v2.source" +my $dir = ""; #par exemple "Vv2_new" + +eval { + $SIG{__WARN__} = sub {usage(1);}; + GetOptions( + "source=s" => \$source, + "dir=s" => \$dir, + ); + }; +$SIG{__WARN__} = sub {warn $_[0];}; + +usage(2) if not $dir or not $source; + +open(SOURCE, "<:encoding(UTF-8)", $source) + or die "Couldn't open file $source, $!"; + +chomp(my @sourcefile = ); +my $size = @sourcefile; +print STDERR $size; +close SOURCE; + +my %correspondance; +for(my $n=0 ; $n<= $#sourcefile ; $n++ ) +{ + $sourcefile[$n] =~ /(\w{40})\W*?(\w+)/; + $correspondance{$1} = $2; +} + +#print "$_ $correspondance{$_}\n" for (keys %correspondance);#imprimer une table d'hachage + +my $size_cles = keys %correspondance; +print STDERR $size_cles; + +opendir(DIR, $dir) or die "Couldn't open the directory $dir, $!"; +my @files = grep {$_ ne '.' and $_ ne '..'} readdir(DIR); +closedir(DIR); + +foreach my $file (@files){ + my $new = $file; + $new =~ s/(\w{40}).txt/$1/; + if ($correspondance{"$new"}) + { + $new = $correspondance{"$new"}; + rename("$dir/$file","$dir/$new.txt"); + print "$file => $new.txt\n"; + } + else + { + print "Pas de correspondance pour le fichier $file\n"; + } +} + +exit 0; + +sub usage +{ +my $code = shift; + +print STDERR "Usage : $programme -s source -d dir\n"; + +exit $code; +} diff --git a/Polaris/Extraction_metadata.pl b/Polaris/Extraction_metadata.pl deleted file mode 100644 index f25dda9..0000000 --- a/Polaris/Extraction_metadata.pl +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use utf8; -use open qw/:std :utf8/; - -use Encode; -use Getopt::Long; - -my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; -$programme = decode_utf8($programme); - -my $metadata = ""; -my $sortie = ""; - -eval { - $SIG{__WARN__} = sub {usage(1);}; - GetOptions( - "metadata=s" => \$metadata, - "sortie=s" => \$sortie, - ); - }; -$SIG{__WARN__} = sub {warn $_[0];}; - -usage(2) if not $metadata or not $sortie; - -open(META, "<:utf8", $metadata) or die "Couldn't open file \"$metadata\", $!"; -open(SORTIE, "+<:utf8", $sortie) or die "Couldn't open file \"$sortie\", $!"; - -foreach my $ligne () -{ - chomp($ligne); - if ($ligne =~ /^DT : (.*)/) - { - print SORTIE "**** *DT_"; - my $dt = $1; - $dt =~ s/ ; /_/g; - print SORTIE $dt; - print SORTIE " *SO_" - #my $dt = $1; - #print SORTIE "**** *DT_" - #print SORTIE $dt; - } - elsif ($ligne =~ /^SO : (.*?) ;/) - { - #$valeurs[1] = "$1"; - #print SORTIE " *SO_" - #print SORTIE $valeurs[1]; - my $so = $1; - $so =~ s/ /_/g; - print SORTIE $so; - print SORTIE " *LA_" - } - elsif ($ligne =~ /^LA : (.*)/) - { - #$valeurs[2] = "$1"; - #print SORTIE " *LA_" - #print SORTIE $valeurs[2]; - my $la = $1; - $la =~ s/ /_/g; - print SORTIE $la; - print SORTIE "\n"; - } -} - -close META; -close SORTIE; - -exit 0; - -sub usage -{ -my $code = shift; - -print STDERR "Usage : $programme -m metadata -s sortie \n"; - -exit $code; -} diff --git a/Polaris/Preparation_entree_Ira_polaris.pl b/Polaris/Preparation_entree_Ira_polaris.pl deleted file mode 100644 index 1873102..0000000 --- a/Polaris/Preparation_entree_Ira_polaris.pl +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use utf8; -use open qw/:std :utf8/; - -use Encode; -use Getopt::Long; - -my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; -$programme = decode_utf8($programme); - -my $dir = ""; -my $metadata = ""; -my $sortie = ""; - -eval { - $SIG{__WARN__} = sub {usage(1);}; - GetOptions( - "dir=s" => \$dir, - "metadata=s" => \$metadata, - "sortie=s" => \$sortie, - ); - }; -$SIG{__WARN__} = sub {warn $_[0];}; - -usage(2) if not $dir or not $metadata or not $sortie; - -#ouverture du dossier contenant les fichier .txt - -opendir(DIR, $dir) or die "Couldn't open file $dir, $!"; -my @files = sort grep {not /^\./} readdir(DIR); -closedir(DIR); -my $size = @files; -print STDERR "Total : $size\n"; - -#ouverture des métadonnées -open(METADATA, "<:utf8", $metadata) or die "Couldn't open file $metadata, $!"; - -#ouverture d'un fichier pour saisir des données -open(DATA, ">:utf8", $sortie) or die "Couldn't open file $sortie, $!"; - -my $i = 0; -while (my $metadata = ) { - print DATA $metadata; - open(FILE, "<:utf8", "$dir/$files[$i]") or die "Couldn't open file $files[$i], $!"; - while(my $ligne = ){ - #suppression des étoiles et des chiffres - $ligne =~ s/\*//; - print DATA $ligne; - } - close(FILE); - $i++; -} - -close(METADATA); -close(DATA); - -exit 0; - - -sub usage -{ -my $code = shift; - -print STDERR "Usage : $programme -d répertoire -m métadonnées -s sortie\n\n"; - -exit $code; -} diff --git a/post_traitement_corpus/rename_file.pl b/post_traitement_corpus/rename_file.pl deleted file mode 100644 index e44ee02..0000000 --- a/post_traitement_corpus/rename_file.pl +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use utf8; -use open qw/:std :utf8/; - -use Encode; -use Getopt::Long; - -my ($programme) = $0 =~ m|^(?:.*/)?(.+)|; -$programme = decode_utf8($programme); - -my $source = ""; #par exemple "Vieil_v2.source" -my $dir = ""; #par exemple "Vv2_new" - -eval { - $SIG{__WARN__} = sub {usage(1);}; - GetOptions( - "source=s" => \$source, - "dir=s" => \$dir, - ); - }; -$SIG{__WARN__} = sub {warn $_[0];}; - -usage(2) if not $dir or not $source; - -open(SOURCE, "<:encoding(UTF-8)", $source) - or die "Couldn't open file $source, $!"; - -chomp(my @sourcefile = ); -my $size = @sourcefile; -print STDERR $size; -close SOURCE; - -my %correspondance; -for(my $n=0 ; $n<= $#sourcefile ; $n++ ) -{ - $sourcefile[$n] =~ /(\w{40})\W*?(\w+)/; - $correspondance{$1} = $2; -} - -#print "$_ $correspondance{$_}\n" for (keys %correspondance);#imprimer une table d'hachage - -my $size_cles = keys %correspondance; -print STDERR $size_cles; - -opendir(DIR, $dir) or die "Couldn't open the directory $dir, $!"; -my @files = grep {$_ ne '.' and $_ ne '..'} readdir(DIR); -closedir(DIR); - -foreach my $file (@files){ - my $new = $file; - $new =~ s/(\w{40}).txt/$1/; - if ($correspondance{"$new"}) - { - $new = $correspondance{"$new"}; - rename("$dir/$file","$dir/$new.txt"); - print "$file => $new.txt\n"; - } - else - { - print "Pas de correspondance pour le fichier $file\n"; - } -} - -exit 0; - -sub usage -{ -my $code = shift; - -print STDERR "Usage : $programme -s source -d dir\n"; - -exit $code; -}