#!/usr/bin/env perl
#
# Data conversion script "xin2ntc.pl":
#
# "xin2ntc-new.pl" is a script for format conversion of document data "Xinhua News
# Service" from LDC format to NTCIR corpus format in NTCIR-8(2009).
#
# USAGE:
#
# % perl xin2ntc-new.pl <input_dir> <output_file>
#
# <input_dir> is supposed to be a directory that contains the "Xinhua News
# Service" document files for each year of 2002~2005, and <output_file> is the
# file to which the converted documents are written.
# For example, if the "Xinhua News Service" document files for 2002~2005 are in
# a directory "XinHua_news_for_NTCIR8_Moat", you can run the script as follows:
#
# % perl xin2ntc-new.pl XinHua_news_for_NTCIR8_Moat xin02-05.ntc.txt
#
# UPDATE HISTORY:
# First created for NTCIR-4.
# Customized by Ni Lao on Jan 17, 2008.
# Customized by Wenbo Li on Aug 10, 2009.
######## Mandatory tags ############
# <DOC> The tag for each document
# <DOCNO> Document identifier
# <LANG> Language code: CH, EN, JA
# <HEADLINE> Title of this news article
# <TEXT> Text of news article
######## Optional tags ############
# <DATE> Issue date
# <P> Paragraph marker
# <SECTION> Section identifier in original newspapers
# <AE> Contain figures or not
# <WORDS> Number of words in 2 bytes (for Mainichi Newspaper)
######## target format ############
#<DOC>
#<DOCNO>XIN_CMN_20051201.0001</DOCNO>
#<LANG>CH</LANG>
#<HEADLINE>第十届广州国际艺术博览会拉开帷幕</HEADLINE>
#<DATE>2005-12-01</DATE>
#<TEXT>
#<P>
#广州市政府还邀请了驻穗各国领事馆的总领事、领事
#出席本届“艺博会”,以此扩大区域之间、国家之间不同
#文化、艺术的交流。
#</P>
#</TEXT>
#</DOC>
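######## source format ############
#<DOC id="XIN_CMN_20051201.0001" type="story" >
#<HEADLINE>
#第十届广州国际艺术博览会拉开帷幕
#</HEADLINE>
#<DATELINE>
#新华社广州12月1日电 (记者 赖少芬)
#</DATELINE>
#<TEXT>
#<P>
#广州市政府还邀请了驻穗各国领事馆的总领事、领事
#出席本届“艺博会”,以此扩大区域之间、国家之间不同
#文化、艺术的交流。
#</P>
#</TEXT>
#</DOC>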
# ---- Main script ----------------------------------------------------------
# Collects the data files found in the input directory, then converts them
# one by one (in sorted path order) into a single output file.
use File::Spec;

# Language code written into every converted document by parse_file().
my $lang = 'CH';

our $idir  = $ARGV[0];    # a directory of Xinhua data files
our $ofile = $ARGV[1];    # an output file

# Without both arguments the script used to scan "./" and then fail with an
# empty file name in the error message; stop early with a usage line instead.
die "usage: perl $0 <input_dir> <output_file>\n"
    unless defined $idir && length $idir && defined $ofile && length $ofile;

# Relative directories keep the historical "./" prefix (it shows up in the
# progress messages); absolute directories must not get it, otherwise
# "/data/xin" would be looked up as "./" followed by "/data/xin".
my $root = File::Spec->file_name_is_absolute($idir) ? $idir : "./$idir";

# ls() appends every matching file path to @sources.
our @sources = ();
ls($root);

# parse_file() prints to the bareword handle "fo", so the handle name is kept.
open(fo, '>', $ofile) || die "cannot create $ofile: $!";
foreach my $fn (sort @sources) {
    parse_file($fn);
}
# Buffered write errors only surface at close time, so check it.
close(fo) || die "cannot close $ofile: $!";
# Convert one LDC-format Xinhua file and append the converted documents to
# the already opened output handle "fo".
#
# Argument:
#   $file - path of the input file. Files whose path does not contain
#           "xin_cmn_YYYYMM" (case-insensitive) are reported and skipped.
#
# Reads the file-level $lang for the language code and prints one progress
# line per file to STDOUT. Dies if the input file cannot be opened.
#
# NOTE(review): the markup literals and the input-handle reads were missing
# from the version under review (empty patterns, "while()", "my $l=;").
# They are reconstructed here for the LDC -> NTCIR conversion named in the
# file header: input tags <DOC id="...">, <HEADLINE>, <DATELINE>; output tags
# <DOC>, <DOCNO>, <LANG>, <HEADLINE>, <DATE>. Verify against a sample file.
sub parse_file {
    my $file = shift;

    if ($file !~ /xin_cmn_([0-9][0-9][0-9][0-9])([0-9][0-9])/i) {
        print "skip file $file\n";
        return;
    }

    # Both variables are lexical; the original "my $date, $docno;" only
    # declared $date and left $docno as a package global.
    my ($date, $docno) = ('', '');

    print "$file\n";
    open(my $in, '<', $file) || die "cannot read $file: $!";

    while (my $line = <$in>) {

        # Document start: take the id attribute and derive the issue date
        # (YYYY-MM-DD) from the digits embedded in it.
        if ($line =~ /<DOC\s+id="([^"]*)"/i) {
            $docno = $1;
            if ($docno =~ /xin_cmn_([0-9]{4})([0-9]{2})([0-9]{2})\.([0-9]{4})/i) {
                $date = "$1-$2-$3";
            }
            else {
                # Plain print: the id is data, not a printf format string.
                print "error parsing docno=$docno\n";
                # Do not build a date from stale capture variables.
                $date = '';
            }
            print fo "<DOC>\n";
            print fo "<DOCNO>$docno</DOCNO>\n";
            print fo "<LANG>$lang</LANG>\n";
            next;
        }

        # Headline: the title is on the line after the opening tag and is
        # followed by a closing-tag line, which is dropped.
        if ($line =~ /<HEADLINE>/) {
            my $headline = <$in>;
            $headline = '' unless defined $headline;
            $headline =~ s/[\r\n]+\z//;
            print fo "<HEADLINE>$headline</HEADLINE>\n";
            scalar readline($in);    # closing headline tag
            next;
        }

        # Dateline: the free-text dateline is replaced by the date derived
        # from the document id.
        if ($line =~ /<DATELINE>/) {
            scalar readline($in);    # dateline text
            print fo "<DATE>$date</DATE>\n";
            scalar readline($in);    # closing dateline tag
            next;
        }

        # Everything else is copied through unchanged.
        print fo $line;
    }

    close($in);
    return;
}
# Collect the data files that sit directly inside a directory.
#
# Argument:
#   $_[0] - directory to scan; subdirectories are not descended into.
#
# Side effect:
#   pushes "<directory>/<entry>" onto the file-level @sources for every
#   entry whose name starts with "xin". Symbolic links, "." and ".." are
#   skipped.
#
# Dies if the directory cannot be opened. Previously that failure was
# ignored and the script silently produced an empty output file.
sub ls {
    my ($root) = @_;

    opendir(my $dh, $root) || die "cannot open directory $root: $!";
    my @entries = readdir($dh);
    closedir($dh);

    foreach my $entry (@entries) {
        next if $entry eq '.' || $entry eq '..';
        next if -l "$root/$entry";
        push(@sources, "$root/$entry") if $entry =~ /^xin/;
    }
    return;
}