#!/usr/bin/env perl -- -*-Perl-*-
# バージョン:1.0(UTF-8,NTCIR-8用毎日新聞記事2002年~2005年の
# データの変換に対応)
# UPDATE HISTORY:
# mai2sgml.pl: First created by Satoshi Sekine in 1999.
# mai2ntc.pl: Customized by NTCIR group in 2001.
# mai2ntc-r.pl: Customized by NTCIR group.
# mai2ntc-r-utf.pl: Customized by Yohei Seki and Daisuke Ishikawa
# on Aug 21, 2009.
# jperl mai2ntc-r.pl
# ex. perl mai2ntc-r-utf.pl mai2002a.txt ntc8-mai2002a.txt ntc8-mai2002a.err
#
#use I18N::Japanese;
use utf8;
use open IN=>'utf8';
use Lingua::JA::Regular::Unicode;
# Section code -> section name.  transfer() looks the AD field up here
# (after zen2han conversion) to produce the SECTION element's content.
%ad = (
    "01" => "1面",
    "02" => "2面",
    "03" => "3面",
    "04" => "解説",
    "05" => "社説",
    "07" => "国際",
    "08" => "経済",
    "10" => "特集",
    "12" => "総合",
    "13" => "家庭",
    "14" => "文化",
    "15" => "読書",
    "16" => "科学",
    "18" => "芸能",
    "35" => "スポーツ",
    "41" => "社会",
);

# Input field key -> output element name, for elements whose content is
# taken from the parsed record (%keyword) by output().
%tag = (
    "C0" => "DOCNO",
    "AD" => "SECTION",
    "AE" => "AE",
    "S1" => "WORDS",
    "T1" => "HEADLINE",
    "T2" => "TEXT",
);

# Output element names whose content does not come from %keyword:
# LANG is filled from $lang and DATE from the global $date.
%newtag = (
    "LANG" => "LANG",
    "AF"   => "DATE",
);

# Language code written into LANG and used as the DOCNO prefix.
$lang = "JA";
# zen2han(STRING)
# Returns the result of passing STRING through alnum_z2h() from
# Lingua::JA::Regular::Unicode (full-width alphanumerics to half-width).
# The call is forced into scalar context so exactly one value comes back.
sub zen2han($) {
    return scalar alnum_z2h( $_[0] );
}
# transfer(KEY, CONTEXT)
#
# Converts the raw value CONTEXT of one input field into the text that is
# stored for that field, depending on the (half-width) field key KEY:
#
#   AF  - returns the half-width value; as a side effect sets the globals
#         $hyear (first two characters) and $date ("YYYY-MM-DD"; two-digit
#         years below 50 are placed in 20xx, all others in 19xx).
#   C0  - returns the half-width value unchanged otherwise.
#   AE  - returns '有' when the value is Y, otherwise '無'.
#   S1  - returns the half-width form of the "全...文字" part of the value
#         (undefined input to zen2han when that part is absent).
#   AD  - returns the section name from %ad for the half-width code
#         (undef for a code that is not in the table).
#   any other key - returns CONTEXT unchanged.
sub transfer($$) {
    my ( $key, $context ) = @_;
    my $data;

    if ( $key eq 'AF' ) {
        $data  = zen2han( $context );
        $hyear = substr( $data, 0, 2 );

        # Two-digit year pivot: 00-49 -> 20xx, 50-99 -> 19xx.
        my $century = ( $hyear < 50 ) ? "20" : "19";
        $date = $century
            . substr( $data, 0, 2 ) . "-"
            . substr( $data, 2, 2 ) . "-"
            . substr( $data, 4, 2 );
    }
    elsif ( $key eq 'C0' ) {
        # The document number keeps its two-digit year; no century is added.
        $data = zen2han( $context );
    }
    elsif ( $key eq 'AE' ) {
        # Normalise the flag like every other coded field so that both a
        # half-width and a full-width Y are recognised.
        # NOTE(review): assumes the flag may arrive full-width like the
        # other fields - confirm against real input.
        my $flag = defined $context ? zen2han( $context ) : '';
        $data = ( defined $flag && $flag eq 'Y' ) ? '有' : '無';
    }
    elsif ( $key eq 'S1' ) {
        # Match the argument itself.  The previous code matched the global
        # $_, which only worked while the caller happened to leave the
        # current input line there.
        my ( $size ) = defined $context ? $context =~ /.*(全(.*)文字)/ : ();
        $data = zen2han( $size );
    }
    elsif ( $key eq 'AD' ) {
        $data = $ad{ zen2han( $context ) };
    }
    else {
        $data = $context;
    }

    return $data;
}
# output()
#
# Writes the record currently held in the global %keyword as one block of
# tagged lines: DOCNO, LANG, SECTION, AE, WORDS, HEADLINE, DATE and TEXT
# (element names come from %tag / %newtag, the language from $lang and
# the date from the global $date).
#
# A record whose TEXT contains both "著作権交渉中" and "表示できません" is
# written to the ER handle (followed by one extra blank line); every other
# record is written to OUT.  Both handles must already be open.
#
# Fix: closing tags are now emitted as "</NAME>".  They were previously
# written as "NAME>", which produced malformed markup.
#
# NOTE(review): a record is delimited only by a leading "\n" and a
# trailing " \n"; no enclosing element is written around it.  Confirm
# that this is what the consumer of the file expects.
sub output {
    # A record without any T2 line is treated as having an empty body.
    my $body = join( "\n", @{ $keyword{'T2'} || [] } );

    my $withheld = ( $body =~ /著作権交渉中/ ) && ( $body =~ /表示できません/ );
    my $fh = $withheld ? \*ER : \*OUT;

    print {$fh} "\n";
    print {$fh} "<", $tag{'C0'}, ">", $lang, "-", $keyword{'C0'}->[0],
        "</", $tag{'C0'}, ">\n";
    print {$fh} "<", $newtag{'LANG'}, ">", $lang, "</", $newtag{'LANG'}, ">\n";
    foreach my $key ( 'AD', 'AE', 'S1', 'T1' ) {
        print {$fh} "<", $tag{$key}, ">", $keyword{$key}->[0],
            "</", $tag{$key}, ">\n";
    }
    print {$fh} "<", $newtag{'AF'}, ">", $date, "</", $newtag{'AF'}, ">\n";
    print {$fh} "<", $tag{'T2'}, ">\n", $body, "\n</", $tag{'T2'}, ">\n";
    print {$fh} " \n";

    # Withheld records are separated by an additional blank line.
    print {$fh} "\n" if $withheld;
    return;
}
# ls(PATH)
#
# Appends "PATH/NAME" to the global @files for every entry NAME of the
# directory PATH whose name ends in ".txt".  The entries "." and ".." and
# symbolic links are skipped.  Entries are added in sorted name order so
# that the result does not depend on the order readdir happens to use.
#
# Dies if the directory cannot be opened (previously the failure was
# ignored and simply produced no files).
sub ls {
    my ($path) = @_;

    opendir( my $dh, $path ) or die "cannot open directory $path: $!";
    my @entries = sort readdir($dh);
    closedir($dh);

    foreach my $entry (@entries) {
        next if $entry eq "." || $entry eq "..";
        next if -l "$path/$entry";
        next unless $entry =~ /^.*\.txt$/;
        push( @files, "$path/$entry" );
    }
    return;
}
# ---------------------------------------------------------------------
# Main program.
#
# Usage: perl SCRIPT <input file or directory> <output file> <error file>
#
# Every input file is piped through "nkf -w" and read line by line.  Each
# line is split into a field tag and its value; the value is converted by
# transfer() and collected in %keyword.  A line whose key is "ID" starts
# a new record, so the record collected so far is written with output().
# ---------------------------------------------------------------------
@ARGV >= 3
    or die "usage: $0 <input file or directory> <output file> <error file>\n";

$infile = $ARGV[0];
$ofile  = $ARGV[1];
$erfile = $ARGV[2];

# A directory argument is expanded by ls(), which fills the global @files.
if ( -d $infile ) {
    ls($infile);
}
else {
    push( @files, $infile );
}

open( OUT, '>', $ofile )  || die "cannot open $ofile: $!";
open( ER,  '>', $erfile ) || die "cannot open $erfile: $!";
binmode( OUT, ":utf8" );
binmode( ER,  ":utf8" );

foreach my $file (@files) {
    # List-form pipe open: the file name is handed to nkf as a single
    # argument and never passes through a shell.
    open( IN, '-|', 'nkf', '-w', $file ) || die "cannot open $file: $!";

    # 1 = no ID line seen yet in this file, 0 = one seen, -1 = at least
    # one record of this file has already been written.
    my $first = 1;

    # $_ is deliberately kept as the line variable: helper code in this
    # file may match against the current input line.
    while (<IN>) {
        chomp;

        # NOTE(review): the field delimiter is assumed to be a backslash,
        # either ASCII or full-width (U+FF3C) - confirm against real input.
        my ( $tag, $context ) = /[\\\x{FF3C}](.*?)[\\\x{FF3C}](.*)/;
        next unless defined $tag;    # not a tagged field line

        my $key  = zen2han( $tag );
        my $data = transfer( $key, $context );

        if ( $key eq "ID" ) {
            if ( $first == 1 ) {
                # First record of the file: nothing collected yet.
                $first = 0;
            }
            else {
                # From the third record on, a blank line is written to
                # OUT before the pending record.
                print OUT "\n" if $first == -1;
                output();
                undef %keyword;
                $first = -1;
            }
        }

        $keyword{$key} = [] unless $keyword{$key};
        push @{ $keyword{$key} }, $data;
    }

    # Flush the last record of this file, then clear the buffer so that
    # its fields cannot leak into the first record of the next file.
    output() if %keyword;
    undef %keyword;

    close(IN) || warn "nkf reported a problem while reading $file\n";
}

close(OUT) || die "cannot close $ofile: $!";
close(ER)  || die "cannot close $erfile: $!";