@article{01-NTCIR7-Preface-KandoN, author={Noriko Kando}, title={Preface of NTCIR-7}, year=2008, abstract={} } @article{01-NTCIR7-OV-KandoN, author={Noriko Kando}, title={Overview of the Seventh NTCIR Workshop}, year=2008, abstract={} } @article{01-NTCIR7-INV-LupuM, author={Mihai Lupu and John Tait}, title={The Information Retrieval Facility and Professional Search}, year=2008, abstract={The Information Retrieval Facility (IRF) is an open IR science institution, managed by leading international experts in the field who ensure the highest quality in the research undertaken. The IRF provides a powerful supercomputing infrastructure that is exclusively concerned with semantic processing of text. It gives access a huge collection of documents representing the global archive of ideas and inventions in an environment which allows large-scale scientific experiments on ways to manage and retrieve this knowledge. The talk will provide a brief introduction to these facilities and will overview the ways the IRF aims to support and facilitate excellent research.The IRF is currently involved in two major public evaluation tracks: the TREC Chemistry track and the CLEF IP track. Both rely on the large patent repository made available to scientists, but have different aims and target groups. While the TREC Chemistry track investigates existing IR tools, as applied in a specific domain, the CLEF-IP track looks at the general domain, but in a multilingual environment. They complement the excellent evaluation work done at NTCIR. } } @article{02-NTCIR7-INV-FerroN, author={Nicola Ferro and Carol Peters}, title={From CLEF to TrebleCLEF: the Evolution of the Cross-Language Evaluation Forum}, year=2008, abstract={The Cross Language Evaluation Forum (CLEF) has been running for nearly ten years now the aim of this paper is to provide a critical assessment of the results achieved so far. 
In the first part of the paper, we provide a brief overview of the entire activity and summarise the main achievements; in the second part, we focus our attention on the Ad Hoc track with the aim of showing how the results of evaluation can be exploited to increase understanding of the many issues involved in multilingual retrieval system development. In the final part, we outline our main ideas for the future of CLEF.} } @article{03-NTCIR7-INV-MajumderP, author={Prasenjit Majumder and Mandar Mitra}, title={What happened in FIRE 2008?}, year=2008, abstract={Inspired by TREC, CLEF and NTCIR, the Forum for Information Retrieval Evaluation (FIRE) aims to create a similar platform for Indian Language Information Retrieval (ILIR). In this talk, we will describe the tasks and results from the FIRE 2008 Workshop, and discuss some of the lessons learned from the experience of trying to construct an evaluation framework for the first time.} } @article{04-NTCIR7-INV-SoboroffI, author={Ian Soboroff}, title={TREC, TAC, takeoffs, tacks, tasks, and titillations for 2009}, year=2008, abstract={This past year, NIST started the Text Analysis Conference (TAC), a new forum combining question-answering, document summarization, and textual entailment. The QA and summarization tasks shared topics and a broadly-defined task with the TREC blog track. In 2009, TREC anticipates a new chemical and patent search track, a new year-long blog collection, and a new billion-page web crawl that four different tracks will focus on.} } @article{01-NTCIR7-OV-CCLQA-MitamuraT, author={Teruko Mitamura and Eric Nyberg and Hideki Shima and Tsuneaki Kato and Tatsunori Mori and Chin-Yew Lin and Ruihua Song and Chuan-Jie Lin and Tetsuya Sakai and Donghong Ji and Noriko Kando}, title={Overview of the NTCIR-7 ACLIA Tasks: Advanced Cross-Lingual Information Access}, year=2008, abstract={This paper presents an overview of the ACLIA (Advanced Cross-Lingual Information Access) task cluster. 
The task overview includes: a definition of and motivation for the evaluation; a description of the complex question types evaluated; the document sources and exchange formats selected and/or defined; the official metrics used in evaluating participant runs; the tools and process used to develop the official evaluation topics; summary data regarding the runs submitted; and the results of evaluating the submitted runs with the official metrics.} } @article{02-NTCIR7-CCLQA-WuY, author={Youzheng Wu and Wenliang Chen and Hideki Kashioka}, title={NiCT/ATR in NTCIR-7 CCLQA Track: Answering Complex Cross-lingual Questions}, year=2008, abstract={This paper describes our complex cross-lingual question answering (CCLQA) system for NTCIR 2008 ACLIA track. To answer complex questions such as events, biographies, definitions, and relations, we designed two models, i.e., the Centroid-vector model and the SVM-based model. In the official evaluation of the NTCIR 2008 CCLQA track, our SVM-based model achieved 22.11% F-score in the English-Chinese cross-lingual task, the highest score among all participants' systems, and 23.16% F-score in the Chinese-Chinese monolingual task. In the automatic evaluation, the F-scores of the SVM-based model and the Centroid-vector model in the English-Chinese task are 27.24%, and 24.55%, respectively. In the Chinese-Chinese task, the two models achieved 28.30%, and 22.78% F-scores.} } @article{03-NTCIR7-CCLQA-ShimaH, author={Hideki Shima and Ni Lao and Eric Nyberg and Teruko Mitamura}, title={Complex Cross-lingual Question Answering as a Sequential Classification and Multi-Document Summarization Task}, year=2008, abstract={In this paper, we describe the JAVELIN IV: CMU's Question Answering system for cross lingual complex question answering. Our development effort is focused on various linguistic annotations, comparison of various answer extraction algorithms including novel sequential classification approach. 
We also describe the text summarization view of the answer selection problem. In the NTCIR7 CCLQA main track official evaluation, our system achieved 16.3% and 19.2% accuracy in the English-to-Japanese and English-to-Chinese subtasks, respectively.} } @article{04-NTCIR7-CCLQA-MoriT, author={Tatsunori Mori and Takuya Okubo and Madoka Ishioroshi}, title={A QA System that Can Answer Any Class of Japanese Non-Factoid Questions and its Application to CCLQA EN-JA Task: Yokohama National University at NTCIR-7 ACLIA CCLQA EN-JA}, year=2008, abstract={In this paper, we reported the evaluation results of our CCLQA system at NTCIR-7 ACLIA. We participated in the English-Japanese (EN-JA) cross-lingual task and the Japanese mono-lingual task. The system consists of a question translation module and a non-factoid-type Japanese question-answering system. The question translation module was developed for NTCIR-6 CLQA, which is a combination of an off-the-shelf machine-translation product and a noun phrase translation module using web documents in order to compensate the insufficiencies in the bilingual dictionary of the MT product. With regard to the non-factoid-type Japanese question-answering system, we proposed a method of non-factoid Web question-answering that can uniformly deal with any class of Japanese non-factoid question by using a large number of example Q&A pairs.} } @article{05-NTCIR7-CCLQA-RenH, author={Han Ren and Donghong Ji and Yanxiang He and Chong Teng and Jing Wan}, title={Multi-Strategy Question Answering System for NTCIR-7 C-C Task}, year=2008, abstract={In NTCIR-7, ACLIA task focus on more complex questions than factoid ones. To participate in Monolingual Chinese task, we propose a multi-strategy QA system that can handle questions with different types. 
Basic steps of this system are described as follow: first, pattern-based question type analysis is performed to classifies the total questions, then an Information Retrieval method is utilized to retrieve and re-rank documents that may contain correct answers; after that, we deal retrieved results with different strategies including heuristic rules, pattern matching and web knowledge bases; finally, nuggets are extracted from answer candidates and formed as the results. The evaluation result shows that our system achieves 43.29% average F-score(beta=3).} } @article{06-NTCIR7-CCLQA-BaiY, author={Yu Bai and Li Guo and Lei Liu and Dongfeng Cai and Bo Zhou}, title={KECIR Question Answering System at NTCIR7 CCLQA}, year=2008, abstract={At the NTCIR-7 CCLQA (Complex Cross-Language Question Answering) task, we participated in the Chinese-Chinese (C-C) and English-Chinese (E-C) QA (Question Answering) subtasks. In this paper, we describe our QA system, which includes modules for question analysis, document retrieval, information extraction and answer generation. Besides, we used an online MT (Machine Translation) system to deal with question translation in our E-C task. An overall analysis and a detailed module-by-module analysis are presented. Since document retrieval is an essential part of CLQA, we also did experiments and submit QA results using IR4QA results in order to find out which IR technique would help CCLQA.} } @article{07-NTCIR7-CCLQA-DengY, author={Yu Deng and Bingjing Xu and Song Liu and Cong Wang}, title={Answer Path at NTCIR-7 CCLQA Track}, year=2008, abstract={This is the first time that our group participate NTCIR and Answer Path is a brand new system. In this system, we have normally three components as Question Analyzer, Passage Retrieval and Answer Extractor. Question Analyzer used the combination methods of rules and Lucene is the choice of our search engine platform. 
And in Answer Extraction, we cut the retrieved passage into sentences and utilized Wiki resource to sort and evaluate our answers in Biography Question and Definition Question. Other than that, we experimented on clustering method in Event Question and Relationship Question was treated as the combination of several definition questions. Asides from the main components above, we developed Sentence Resemble Model and Answer Filtering and so on. And there were a lot of components in our plan that would be developed in the future.} } @article{08-NTCIR7-CCLQA-HigashinakaR, author={Ryuichiro Higashinaka and Hideki Isozaki}, title={NTT's CCLQA System for NTCIR-7 ACLIA}, year=2008, abstract={This paper describes our Complex Cross-Lingual Question Answering (CCLQA) system based on the technologies used in our past NTCIR systems for QAC and CLQA. We implemented a new rule-based English question analyzer to extract English query terms. The query terms are translated into Japanese by using translation dictionaries. For DEFINITION and BIOGRAPHY questions, we reused our definition module for QAC-4. For RELATIONSHIP questions, we developed a new module based on our why-QA approach for QAC-4. When these modules are not applicable, a simple sentence retriever is called. According to the organizers' evaluation results, although our EN-JA system performed rather poorly because of the low coverage of the translation dictionaries, our JA-JA system achieved the second best score among the four participants.} } @article{09-NTCIR7-CCLQA-LeeY, author={Yi-Hsun Lee and Cheng-Wei Lee and Cheng-Lung Sung and Mon-Tin Tzou and Chih-Chien Wang and Shih-Hung Liu and Cheng-Wei Shih and Pei-Yin Yang and Wen-Lian Hsu}, title={Complex Question Answering with ASQA at NTCIR 7 ACLIA}, year=2008, abstract={For NTCIR 7, we implemented our question answering system ASQA (Academic Sinica Question Answering) for complex questions. We use three methods to select answer strings from News corpus. 
(a) We use syntactic pattern usually used in QA system to retrieve more precise answer strings than traditional IR. (b) With external knowledge, we can accurately find some specific answers which the traditional IR couldn’t process. (c) Entropy based and co-occurrence based mining methods to retrieve relevant answer strings from document retrieval. In NTCIR 7 CCLQA task, ASQA achieved 0.26 in the CT-CT task and 0.20 in the CS-CS task.} } @article{01-NTCIR7-OV-IR4QA-SakaiT, author={Tetsuya Sakai and Noriko Kando and Chuan-Jie Lin and Teruko Mitamura and Hideki Shima and Donghong Ji and Kuang-Hua Chen and Eric Nyberg}, title={Overview of the NTCIR-7 ACLIA IR4QA Task}, year=2008, abstract={This paper presents an overview of the IR4QA (Information Retrieval for Question Answering) Subtask of the NTCIR-7 ACLIA (Advanced Cross-lingual Information Access) Task Cluster. IR4QA evaluates traditional ranked retrieval of documents using well-studied metrics such as Average Precision, but the retrieval task is embedded in the context of cross-lingual question answering. That is, document retrieval is treated as a component of the entire question answering system. This paper concentrates on how relevance assessments for the Simpified Chinese, Traditional Chinese and Japanese IR4QA test collections were obtained, and the outcome of the formal IR4QA evaluation using the three collections. For the relationship between IR4QA and the entire ACLIA task cluster, we refer the reader to the overview paper of ACLIA. For details of the individual IR4QA systems, we refer the reader to the participants' reports.} } @article{02-NTCIR7-IR4QA-ShiL, author={Lixin Shi and Jian-Yun Nie and Guihong Cao}, title={RALI Experiments in IR4QA at NTCIR-7}, year=2008, abstract={In this report, we examine what information retrieval techniques can be useful for question answering. In particular, for different types of question, we exploit different types of external resource. 
For example, for biography questions, we extend the original questions by common biography-related words extracted from Wikipedia. Our experiments show that these techniquescan significantly increase retrieval effectiveness.} } @article{03-NTCIR7-IR4QA-HsuC, author={Chih-Chuan Hsu and Yu-Te Li and You-Wei Chen and Shih-Hung Wu}, title={Query Expansion via Link Analysis of Wikipedia for CLIR}, year=2008, abstract={In this paper, we report how we do the query expansion in NTCIR-7 IR4QA subtask. We submit the results total nine runs in this subtask, which are English to traditional Chinese, simplified Chinese and Japanese cross-language information retrieval (CLIR) in the official T-run, D-run and DN-run. In these runs, we use Google online translation service to translate query terms and use the Wikipedia as an information resource of query expansion (QE) in addition to the OKAPI query expansion.} } @article{04-NTCIR7-IR4QA-LuoW, author={Weihua Luo and Tian Xia and Ji Guo and Qun Liu}, title={ICT-Crossn: The System of Cross-lingual Information Retrieval of ICT in NTCIR-7}, year=2008, abstract={IR4QA is a new task in NTCIR-7, which intends to evaluate which IR techniques are more helpful to a QA system. This paper describes in detail the implementation of our IR4QA system, ICT-Crossn. The system consists of a query translation component that integrates the methods of phrase based statistical machine translation and OOV translation methods based on search engine, and a document retrieval component which combines outputs of multiple IR models with a linear model. We tune the parameters on the development set constructed on the dry run set. 
The official evaluation results show our method achieves a good performance.} } @article{05-NTCIR7-IR4QA-LaoN, author={Ni Lao and Hideki Shima and Teruko Mitamura and Eric Nyberg}, title={Query Expansion and Machine Translation for Robust Cross-Lingual Information Retrieval}, year=2008, abstract={In this paper, we describe the Information Retrieval subsystem of JAVELIN III, a question-answering system that answers complex questions from multilingual sources. Our research focus is on different strategies for query term extraction, translation, filtering, expansion and weighting, including a novel alias expansion technique using lexico-semantic patterns learned with weakly-supervised algorithm. In the NTCIR7 IR4QA evaluation, our retrieval system achieved 59% and 59% MAP in the Chinese-to-Chinese and Japanese-to-Japanese subtasks, respectively. We provide a rationale for the retrieval system design, and present a detailed error analysis for our formal run results.} } @article{06-NTCIR7-IR4QA-CaiD, author={Dongfeng Cai and Dongyuan Li and Yu Bai and Bo Zhou}, title={KECIR Information Retrieval System for NTCIR7 IR4QA Task}, year=2008, abstract={The KECIR group participated in the IR4QA (Information Retrieval for Question Answering) task of the NTCIR-7 ACLIA (Advanced Cross-lingual Information Access) Task Cluster. In this paper, we describe our approach on the Simplified Chinese (CS) document collections of ACLIA for the Information Retrieval for Question Answering (IR4QA) task. The purposes of our research are: (a) To examine the effect of different query generation mechanism on retrieval performance; (b) To evaluate the hybrid model whether or not more efficient than unique model. This year we compared three different approaches of standard query expansion, including Local Context Analysis (LCA) method (KECIR-CS-CS-01-T), Relevance feedback method (KECIR-CS-CS-02-DN) and using online encyclopedia method (KECIR-CS-CS-03-DN). 
Some famous information retrieval models, i.e., Vector Space Model, Language Model were also adopted in our study for ranking relevant documents.} } @article{07-NTCIR7-IR4QA-ChenH, author={I-Chien Liu and Lun-Wei Ku and Kuang-Hua Chen and Hsin-Hsi Chen}, title={NTUBROWS System for NTCIR-7 Information Retrieval for Question Answering}, year=2008, abstract={This paper presents our information retrieval system for the NTCIR-7 information retrieval for question answering task. Our system is com-posed by three parts: (1) Query processing per-forms stop-word filtering and chooses a set of terms as required set. (2) Retrieval model ap-plies three retrieval models and limits the result based on the required set acquired from pre-vious stage. (3) Re-rank module gives documents scores by their term distribution. When evaluat-ing on the NTCIR-7 test set, the performance of our system achieves a mean average precision (MAP) of 0.4635, a Q-measure of 0.4811, and an MSn-DCG of 0.6831 on NTCIR-7 IR4QA testing set.} } @article{08-NTCIR7-IR4QA-HeX, author={Xiaoning He and Peidong Wang and Haoliang Qi and Muyun Yang and Guohua Lei and Yong Xue}, title={Using Google Translation in Cross-Lingual Information Retrieval}, year=2008, abstract={HIT2 Lab participated in NTCIR 7 IR4QA task. In this task many topics are consisted of name entities, so Google translation was used to translate query terms because of its high performance on name entity translation. We use KL-divergence model to perform retrieval and Chinese character bigram as our indexing unit. Pseudo feedback was used trying to improve average precision. 
We achieved competitive results in the task.} } @article{09-NTCIR7-IR4QA-HyodoT, author={Tatsuhiro Hyodo and Tomoyosi Akiba}, title={Statistical Machine Translation based Passage Retrieval --- Experiment at NTCIR-7 IR4QA Task}, year=2008, abstract={In this paper, we apply the statistical machine translation based passage retrieval, which was proposed at the last NTCIR-6 CLQA subtask, to the IR4QA Task. The experimental evaluation shows that the method is more effective for the relation and event type questions, which are longer and including relatively mane common keywords, than the definition and biography type questions, which are shorter and often including only named entities. } } @article{10-NTCIR7-IR4QA-LarsonR, author={Ray Larson and Fredric Gey}, title={High Baseline Japanese Information Retrieval for Question-Answering}, year=2008, abstract={For NTCIR Workshop 7 UC Berkeley participated in IR4QA (Information Retrieval for Question Answering) as well as the Patent Mining tracks. For IR4QA we only did Japanese monolingual search. Our focus was thus upon Japanese topic search against the Japanese News document collection as in past NTCIR participations. We preprocessed the text using the ChaSen morphological analyzer for term segmentation. We utilized a time-tested logistic regression algorithm for document ranking coupled with blind feedback. The results were satisfactory, ranking second among IR4QA overall submissions.} } @article{11-NTCIR7-IR4QA-LiuM, author={Maofu Liu and Fang Fang and Qing Hu and Jianxun Chen}, title={Question Analysis and Query Expansion in CS-CS IR4QA}, year=2008, abstract={This paper describes our work in NTCIR-7 on the subtask of simplified Chinese monolingual information retrieval for question answer (CS-CS IR4QA). 
Based on the observation that inappropriate key terms and term mismatch often result in depressed precision and impressive recall, we employ a special question analysis method extracting more appropriate key terms and apply the query expansion technique gaining more relevant key terms, to enhance precision and efficiency for retrieval performance.} } @article{12-NTCIR7-IR4QA-TengC, author={Chong Teng and Yanxiang He and Donghong Ji and Han Ren and Lingpeng Yang and Wei Xiong}, title={Information Retrieval Using PU Learning Based Re-ranking}, year=2008, abstract={In this paper, we describe our approach for information retrieval for question answering (IR4QA) on simple Chinese language of NTCIR-7 tasks. Firstly, we use both bi-grams and single Chinese characters as index units and use OKAPI BM25 as retrieval model. Secondly, we re-rank all documents’ orders for the first retrieval documents. We focus mostly on the document re-ranking technique. We address probabilistically labeling relevant degree between the first retrieval documents and query topics. In other words, we want to know the probability of a document belongs to relevance/irrelevance class. We employ PU learning to solve this problem, and use Bayesian classifier and EM algorithm in process of computing the probability. Consequently, those relevant documents with high probability are updated rank. Lastly, we use re-ranked retrieved documents to do query expansion. 
Evaluation at NTCIR-7 shows that our group achieves 0.3862 and 0.3806 MAP based on pseudo-qrels and real qrels respectively.} } @article{13-NTCIR7-IR4QA-TomlinsonS, author={Stephen Tomlinson}, title={Experiments in Finding Chinese and Japanese Answer Documents at NTCIR-7}, year=2008, abstract={We describe evaluation experiments conducted by submitting retrieval runs for the natural language Simplified Chinese, Traditional Chinese and Japanese questions of the Information Retrieval for Question Answering (IR4QA) Subtask of the Advanced Cross-lingual Information Access (ACLIA) Task Cluster of the 7th NII Test Collection for IR Systems Workshop (NTCIR-7). In a sampling experiment, we found that, on average per topic, the percentage of answer documents assessed was less than 65% for Simplified Chinese, 32% for Traditional Chinese and 41% for Japanese. However, our preferred measure for this task, Generalized Success@10, only considers the rank of the first answer document retrieved for each topic, as one good document answering the question is all that one really needs for this task. We experimented with different techniques (words vs. n-grams, removing question words and blind feedback) and found that the choice of technique can have a substantial impact on the rank of the first answer document for particular questions.} } @article{01-NTCIR-OV-MOAT-SekiY, author={Yohei Seki and David Kirk Evans and Lun-Wei Ku and Le Sun and Hsin-Hsi Chen and Noriko Kando}, title={Overview of Multilingual Opinion Analysis Task at NTCIR-7}, year=2008, abstract={This paper describes an overview of the Multilingual Opinion Analysis Task from 2007 to 2008 at the Seventh NTCIR Workshop. We created test collections of 22, 17, 17, 16 topics (7,163, 4,711, 6,174, and 5,301 sentences) in Japanese, English, Traditional Chinese, and Simplified Chinese. 
Using this test collection, we conducted five subtasks: (1) mandatory opinionated sentence judgment, and optional subtasks of (2) relevant sentence judgment, (3) polarity judgment, (4) opinion holder extraction, and (5) opinion target extraction. 32 results were submitted from 21 participants with five results submitted by the organizers. In this paper we present the task definition, the details of the test collection, the evaluation results of the groups that participated in this task, and their approach.} } @article{02-NTCIR7-MOAT-ZagibalovT, author={Taras Zagibalov and John Carroll}, title={Almost-Unsupervised Cross-Language Opinion Analysis at NTCIR-7}, year=2008, abstract={We describe the Sussex NLCL System entered in the NTCIR-7 Multilingual Opinion Analysis Task(MOAT). Our main focus is on the problem of portability of natural language processing systems across languages. Our system was the only one entered for all four of the MOAT languages, Japanese, English, and Simplified and Traditional Chinese. The system uses an almost-unsupervised approach applied to two of the sub-tasks: opinionated sentence detection and topic relevance detecion.} } @article{03-NTCIR7-MOAT-QuL, author={Lizhen Qu and Cigdem Toprak and Niklas Jakob and Iryna Gurevych}, title={Sentence Level Subjectivity and Sentiment Analysis Experiments in NTCIR-7 MOAT Challenge}, year=2008, abstract={This paper describes our supervised approach to the opinionated and the polarity subtasks in the NTCIR-7 MOAT Challenge. We apply a sequential tagging approach at the token level and use the learned token labels in the sentence-level classification tasks. In our formal run submissions, we utilized SVMhmm in both tasks with syntactic and lexicon-based features. Additionally, we present our experiments with structural correspondence learning (SCL) for addressing the domain adaptation problem in sentiment analysis. 
We report experiments on three corpora: MPQA, NTCIR-6 and NTCIR-7, however our formal run submission is trained on MPQA. We reached an F-measure of 0.48 in the opinionated (lenient) and 0.27 (lenient) in the polarity subtasks.} } @article{04-NTCIR7-MOAT-LuB, author={Bin Lu and Benjamin K. Tsou and Oi Yee Kwong}, title={Supervised Approaches and Ensemble Techniques for Chinese Opinion Analysis at NTCIR-7}, year=2008, abstract={For the opinion analysis task on traditional Chinese texts at NTCIR-7, supervised approaches and ensemble techniques have been used and compared in our participating system. Two kinds of supervised approaches were employed here: 1) the supervised lexicon-based approach, 2) machine learning approaches, and ensemble techniques were also used to combine the results given by different approaches. By making use of these approaches and ensemble methods in various combinations, we submitted three runs for each of the two subtasks we participated in: opinionated sentence recognition and opinion polarity classification. The results show that our system achieved state-of-the-art performances on both subtasks: the highest F-measure on the opinionated sentence recognition task and the second highest F-measure on the opinion polarity classification task amongst all runs submitted by seven participants. Furthermore, The combination of different classifiers markedly outperformed individual classifiers on the opinion polarity classification task, while did not show much improvement, if no hurt, on the opinionated sentence recognition task.} } @article{05-NTCIR7-MOAT-LiuK, author={Kang Liu and Jun Zhao}, title={NLPR at Multilingual Opinion Analysis Task in NTCIR7}, year=2008, abstract={This paper presents our work in the simplified Chinese opinion analysis task in NTCIR7. For identifying the subjective sentences, the domain adaptation technique was applied in our method, so that the data in NTCIR6 can be used for training subjective classifier. 
The evaluation results proves that method proposed in this paper is effective. In extracting the opinion holder, we used the CRF model, which was combined with manual designed heuristics rules. For CRF model we not only extracted part-of-speech features, semantic class features, contextual features, but also some dependency features through parsing analysis. The evaluation results prove that the proposed method is effective for extracting opinion holders.} } @article{06-NTCIR7-MOAT-HuangY, author={Yunping Huang and Yulin Wang and Le Sun}, title={ISCAS at Multilingual Opinion Analysis Task}, year=2008, abstract={The paper presents our work in the multilingual opinion analysis task in NTCIR7 in Simplified Chinese. In detecting opinionated sentences, an EM algorithm was proposed to extract the sentiment words based on the sentimental dictionary, and an iterative algorithm was used to estimate the score of the sentiment words and the sentences. In detecting relevant sentences, we solve this problem by analogizing the task to the traditional information retrieval task. The difficulty lies in that some sentence is relevant to the topic even if there are no key words hit in it. In this situation, we use an implicit feedback and query extension method to refine the result. The evaluation results and the result analysis will also be presented.} } @article{07-NTCIR7-MOAT-KarlgrenJ, author={Jussi Karlgren and Gunnar Eriksson and Oscar Täckström}, title={SICS at NTCIR-7 MOAT: Constructions Represented in Parallel with Lexical Items}, year=2008, abstract={This paper describes experiments to find attitudinal expressions in written English text. 
The experiments are based on an analysis of text with respect to not only the vocabulary of content terms present in it (which most other approaches use as a basis for analysis) but also on structural features of the text as represented by presence of function words (in other approaches often removed by stop lists) and by presence of constructional features (typically disregarded by most other analyses). In our analysis, following a constructional grammatical framework, structural features are treated similarly to vocabulary features. Our result gives us reason to conclude - provisionally, until more empirical verification experiments can be performed - that: * Linguistic structural information does help in establishing whether a sentence is opinionated or not; whereas * Linguistic information of this specific type does not help in distinguishing sentences of differing polarity.} } @article{08-NTCIR7-MOAT-KimJ, author={Jungi Kim and Hun-Young Jung and Sang-Hyeob Nam and Yeha Lee and Jong-Hyeok Lee}, title={English Opinion Analysis for NTCIR7 at POSTECH}, year=2008, abstract={We describe an opinion analysis system developed for Multilingual Opinion Analysis Task at NTCIR7. Given a topic and relevant newspaper articles, our system determines whether a sentence in the articles carries an opinion, if so, then extract the polarity and holder of the opinion. Our system exploits subjectivity lexicon to score the sentiment of words that consist of a sentence, in addition with a weight that reflects importance and discriminating power of words, commonly dealt in information retrieval tasks. 
Our system achieves high performance overall, with exceptional performance on polarity judgment of sentences.} } @article{09-NTCIR7-MOAT-KimY, author={Youngho Kim and Seongchan Kim and Sung-Hyon Myaeng}, title={Extracting Topic-related Opinions and their Targets in NTCIR-7}, year=2008, abstract={In recent years, there have been many interests in opinion resources such as online news, blogs, and forums. With this tendency, many opinion-related applications are proposed to quench an opinion-seeking desire of users. However, selecting publicly interesting opinions is important to improve the effectiveness of such opinion applications, by focusing more on important opinions. To achieve this goal, we propose an opinion mining system which extracts topic-related opinions (at sentence level) and identifies their targets. Our system can be characterized with probabilistic divergence based keywords extraction, language model based topic relevance determination with web-snippets expansion, and heuristic feature based target identification. Experimental results show that our approach is promising. } } @article{10-NTCIR7-MOAT-KobayashiD, author={Daisuke Kobayashi and Hidetsugu Nanba and Toshiyuki Takezawa}, title={Extraction of Opinion Sentences using Machine Learning: Hiroshima City University at NTCIR-7 MOAT}, year=2008, abstract={We propose a machine learning-based method for extracting opinion sentences using 13 features including about 760,000 of sentence-final expressions. 
We submitted two systems to the Japanese Subtask of the MOAT at NTCIR-7 Workshop, and obtained F-values of 0.5615 and 0.3319 using lenient gold standard, and 0.5213 and 0.3561 using strict gold standard, respectively.} } @article{11-NTCIR7-MOAT-KuL, author={Lun-Wei Ku and I-Chien Liu and Chia-Ying Lee and Kuan-hua Chen and Hsin-Hsi Chen}, title={Sentence-Level Opinion Analysis by CopeOpi in NTCIR-7}, year=2008, abstract={In this paper, we introduce our system, CopeOpi, for analyzing opinionated information in NTCIR-7 MOAT task's document collections. We participated in all tasks except opinion target extraction and submitted three runs for both simplified and traditional Chinese sides. For opinion extraction task, our algorithm was based on the bag-of-character methods proposed in NTCIR-6 and considered morphological structures of Chinese words to extract opinion words more correctly. How distant an opinion word is to the end of the sentence is also considered to adjust its opinion weight. The performance of the opinion extraction, which is the second best of all participants, achieves the f-measure 0.672 under the lenient metric and 0.783 under the strict metric. The performances of polarity detection and the relevance judgment are both ranked the third.} } @article{12-NTCIR7-MOAT-MengX, author={Xinfan Meng and Houfeng Wang}, title={Detecting Opinionated Sentences by Extracting Context Information}, year=2008, abstract={In this paper, we briefly describe several experimental methods to solve MOAT at NTCIR-7. In the subtask of opinionated sentence detection, two methods aiming to extract the context information of each sentence are proposed. Maximum Entropy model is used to predict the polarity class. A rule-based pattern matching scheme is devised to find topic-relevant sentence. 
For the subtask of detecting holders and targets, the CRF model is adopted.} } @article{13-NTCIR7-MOAT-MizuguchiH, author={Hironori Mizuguchi and Masaaki Tsuchida and Kenji Tateishi and Dai Kusui}, title={Opinion Sentence and Topic Relevant Sentence Extraction by Using Coherent Structure among the Sentences}, year=2008, abstract={We developed a new sentence extraction framework, the Sliding Window Framework, by using coherent structure among the sentences. Coherent structure means that the sentences that relate to a certain topic in an article are written in clusters to preserve the logical organization. To use the structure, our method makes blocks that consist of sentences in a window of a certain size, then estimates the score of each block, and judges each sentence from the scores. We applied our framework to opinion sentence extraction and topic relevant sentence extraction. In the result of our experiments, our framework achieved a very high recall ratio and a high F-value.} } @article{14-NTCIR7-MOAT-OzawaH, author={Hiroya Ozawa and Hiroya Susuki and Hiroaki Saito}, title={An Opinion Detection and Classification System Using Support Vector Machines}, year=2008, abstract={We developed an opinion detection and polarity classification system for Japanese newspapers at NTCIR-7 MOAT task. Our system detects sentences which are "opinionated" or "not opinionated" and classifies these into "positive", "negative" or "neutral". We used Support Vector Machines (SVM) as a machine learning method. To determine features, we focused on the end of expression, particular structure of opinionated sentences, and continuity of opinion. 
In the formal run, the opinion detection accuracy was 81.15%, recall 34.16%, F-measure 48.08, and the polarity classification attained accuracy 48.05%, recall 18.01%, F-measure 26.20.} } @article{15-NTCIR7-MOAT-SekiY, author={Yohei Seki}, title={A Multilingual Polarity Classification Method using Multi-label Classification Technique Based on Corpus Analysis}, year=2008, abstract={In NTCIR-7 MOAT, we participated in four subtasks (opinion & holder detection, relevance judgement, and polarity classification) at two language sides: Japanese and English. In this paper, we focused on the feature selection and polarity classification methodology in both languages. To detect opinion and classify the polarity, the features were selected based on a statistical χ-square test over NTCIR-6 and MPQA corpora. We also compared several multi-label classification methods to classify positive, negative, and neutral polarity. The evaluation results suggested that the coverage of the feature in Japanese was acceptable for the opinion analysis in newspaper articles, but there was still room for improvement in the coverage of the features in English. We also found the result of SVM voting approach was slightly better than the results of Multi-label classification approach. } } @article{16-NTCIR7-MOAT-TakedaT, author={Takaharu Takeda}, title={Opinion Analysis Syetem for NTCIR-7}, year=2008, abstract={We present our opinion analysis system for Japanese that was used in the Opinion Analysis Pilot Task at NTCIR-7. Our purpose is to evaluate and validate the methods, then offer new insight into this area.} } @article{17-NTCIR7-MOAT-Villena-RomanJ, author={Julio Villena-Roman and Sara Lana-Serrano and José C. 
González-Cristóbal}, title={MIRACLE at NTCIR-7 MOAT: First Experiments on Multilingual Opinion Analysis}, year=2008, abstract={This paper describes the participation of MIRACLE research consortium at NTCIR-7 Multilingual Opinion Analysis Task, our first attempt on sentiment analysis and second on East Asian languages. We took part in the main mandatory opinionated sentence judgment subtask (to decide whether each sentence expresses an opinion or not) and the optional relevance and polarity judgment subtasks (to decide whether a given sentence is relevant to the given topic and also the polarity of the expressed opinion). Our approach combines a semantic language-dependent tagging of the terms of the sentence and the topic and three different ad-hoc classifiers that provide the specific annotation for each subtask, run in cascade. The models have been trained with the corpus provided in NTCIR-6 Opinion Analysis pilot task. } } @article{18-NTCIR7-MOAT-WuY, author={Yu-Chieh Wu and Li-Wei Yang and Jeng-Yan Shen and Linag-Yu Chen and Shih-Tung Wu}, title={Tornado in Multilingual Opinion Analysis: A Transductive Learning Approach for Chinese Sentimental Polarity Recognition}, year=2008, abstract={In this paper, we present our statistical-based opinion analysis system for NTCIR-MOAT track this year. Our method involves two different approaches: (1) the machine learning-based prototype system (on the basis of support vector machines (SVMs)) and (2) stochastic estimation of the character-level of words. The former were the real applications of state-of-the-art machine learning algorithms, while the latter comprises of ad-hoc opinioned word, phrase analysis. We submitted both two runs to NTCIR-MOAT in this year. The prototype system was first designed for traditional Chinese. We also directly port it to Simplified Chinese text with dictionary-based word translation. To make the model more robust, we present the idea of transductive learning to our models. 
The main advantage of this approach is that it learns the hypothesis from labeled data meanwhile adapt to the large unlabeled data. Our method could not only be applied to SVM-based approaches, but also is applicable with the other non-machine learning algorithms. The experimental results showed that our method (approach 1) can effectively identify the opinioned sentences in 0.661 and 0.611 f-measure rates under the lenient test. In terms of polarity judgment, our method achieves 0.284 and 0.294 in F-measure rates of the proposed two approaches, respectively. In the relevant sentence judgment track, our group achieved the best and the second best results among all other participants. Owing to the lack of labeled training data, we trust that our method could be further enhanced by feeding with more consistent and large annotated corpus.} } @article{19-NTCIR-MOAT-XuR, author={Ruifeng Xu and Kam-Fai Wong and Yunqing Xia}, title={Coarse-Fine Opinion Mining - WIA in NTCIR-7 MOAT Task}, year=2008, abstract={This paper presents an opinion analysis system developed by CUHK_PolyU_Tsinghua Web Information Analysis Group (WIA), namely WIA-Opinmine, for NTCIR-7 MOAT Task. Different from most existing opinion mining systems, which recognize opinionated sentences as one-step classification procedure, WIA-Opinmine adopts a multi-pass coarse-fine analysis strategy. A base classifier firstly coarsely estimates the opinion of sentences and the document. The obtained document-level and sentence-level opinions are then incorporated in a complex classifier to re-analyze the opinion of sentences to obtain refined sentence and document opinions. The updated opinion features are fed back to the complex classifier to further refine the opinion analysis. Such circles terminate until the analysis results converge. Similar strategy is adopted in sentence-topic relevance estimation. 
Furthermore, the mutual reinforcement between the analysis of sentence relevance and sentence opinion are integrated in one framework in WIA-Opinmine. Evaluations on NTCIR-7 MOAT Traditional Chinese and Simplified Chinese sides show that WIA-Opinmine achieves the best precisions performance in five subtasks and the best F performance in three subtasks including polarity determination, opinion holder recognition and opinion target recognition. These results show that the proposed framework integrating coarse-fine opinion mining strategy and the mutual reinforcement between the analysis of sentence relevance and sentence opinion is promising.} } @article{20-NTCIR7-MOAT-ZhangC, author={Chunliang Zhang and Ke Wang and Muhua Zhu and Tong Xiao and Jingbo Zhu}, title={NEUOM: Identifying Opinionated Sentences in Chinese and English Text}, year=2008, abstract={This paper introduces our NEUOM system which participates in the opinionated sentence detection task, one of evaluation tasks in Multilingual Opinion Analysis Task (MOAT) of NTCIR-7. NEUOM system adopts a sentiment lexicon-based(SLB) approach to identifying opinionated sentences in a Chinese text and English text. For English task, a machine learning algorithm, naïve Bayesian classification model, is also tried with the use of the English training corpora, such as MPQA and NTCIR-6 data set. Experimental results show that in the English task SLB method achieved better F1 performance than Naïve Bayesian model.} } @article{21-NTCIR7-MOAT-ZubaryevaO, author={Olena Zubaryeva and Jacques Savoy}, title={Opinion and Polarity Detection within Far-East Languages in NTCIR-7}, year=2008, abstract={This paper presents our work in the Multilingual Opinion Analysis Task (MOAT) done during the NTCIR-7 evaluation campaign. This is our first participation in this kind of retrieval and classification task in which we participated for the English, Japanese and traditional Chinese language. 
As a basic model we suggested a probabilistic model derived from Muller's method [1] that allows us to determine and weight terms (isolated words, bigram of words, noun phrases, etc.) belonging to a given category compared to the rest of the corpus. In the current task, the classification categories are positive, negative, neutral and not opinionated. To succeed at this classification task, we have adopted the logistic regression method in order to define the most probable category for each input sentence. Our participation was strongly motivated by the objective to suggest an approach on the polarity subtask of the MOAT with a minimal linguistic component.} } @article{01-NTCIR7-OV-PATMN-NanbaH, author={Hidetsugu Nanba and Atsushi Fujii and Makoto Iwayama and Taiichi Hashimoto}, title={Overview of the Patent Mining Task at the NTCIR-7 Workshop}, year=2008, abstract={This paper introduces the Patent Mining Task of the Seventh NTCIR Workshop and the test collections produced in this task. The task’s goal was the classification of research papers written in either Japanese or English in terms of the International Patent Classification (IPC) system, which is a global standard. For this task, 12 participant groups submitted 49 runs. In this paper, we also report the evaluation results of the task.} } @article{02-NTCIR7-PATMN-XiaoT, author={Tong Xiao and Feifei Cao and Tianning Li and Guolong Song and Ke Zhou and Jingbo Zhu and Huizhen Wang}, title={KNN and Re-ranking Models for English Patent Mining at NTCIR-7}, year=2008, abstract={This paper describes our patent mining system for NTCIR-7 competition which maps a research paper abstract into IPC taxonomy. Our system is basically under the k-Nearest Neighboring framework, in which various similarity calculation and ranking methods are used. We employ two re-ranking techniques to improve the performance by the use of richer features. 
Our systems performed well on the NTCIR-7 patent mining task (English sub-task) and obtained the best MAP-measure among all the participations.} } @article{03-NTCIR7-PATMN-BianG, author={Guo-Wei Bian and Shun-Yuan Teng}, title={Integrating Query Translation and Text Classification in a Cross-Language Patent Access System}, year=2008, abstract={In this paper, a cross-language patent retrieval and classification system is presented to integrate the query translation using various free web translators on the internet and the document classification. The language-independent indexing method was used to process the multilingual patent documents, and the query translation method was used to translate the query from the source language to the target language. The mono-lingual and cross-lingual retrieved patent documents would be processed to classify the research papers (the queries) in terms of the International Patent Classification (IPC). The results indicate that the performance of the cross-lingual text classification reached almost the same level of the mono-lingual text classification.} } @article{04-NTCIR7-PATMN-CaoG, author={Guihong Cao and Jian-Yun Nie and Lixin Shi}, title={NTCIR-7 Patent Mining Experiments at RALI}, year=2008, abstract={We participated in the patent mining task at NTCIR7 workshop. Particularly, our experiments focus on English corpus. Based on the Indri search engine, we implemented a patent classification system, which is able to assign a research paper into the IPC system according to the annotated patents in the database. As the task is a cross-genre classification task, we tried several methods to bridge the gap between the research papers and patents. 
Unfortunately, most of the methods do not produce consistent improvements.} } @article{05-NTCIR7-PATMN-ClinchantS, author={Stephane Clinchant and Jean-Michel Renders}, title={XRCE's Participation to Patent Mining Task at NTCIR-7}, year=2008, abstract={In this first participation to the NTCIR - Patent Mining Task, our goal was to assess very simple large-scale categorization methods, especially in a cross-lingual framework. Our categorizers are instances of the "k-nearest neighbors" classifier. We used the Language Modelling approach to Information Retrieval as the building block to define similarity measures when computing the nearest neighbors. We also adopted a particular fusion scheme when building a class assignment function from the labels of nearest neighbors, that appears to be particularly efficient. As bilingual resources, we used simply a parallel part of the NTCIR-1 corpus, from which we extracted a bilingual lexicon. Even if this could look like a very crude approach, this bilingual lexicon, when integrated in a cross-lingual similarity measure, gave performance that is exactly at the same level as the monolingual case.} } @article{06-NTCIR7-PATMN-FujinoA, author={Akinori Fujino and Hideki Isozaki}, title={Multi-label Classification using Logistic Regression Models for NTCIR-7 Patent Mining Task}, year=2008, abstract={We design a multi-label classification system based on a machine learning approach for NTCIR-7 Patent Mining Task. In our system, we employ a logistic regression model for each International Patent Classification (IPC) code that determines the assignment of research papers to the IPC code. The logistic regression models are trained by using patent documents given by task organizers. To mitigate the overfitting of the logistic regression models to the patent documents, we design the features of the patent documents by utilizing the distribution of vocabulary words included in research papers. 
Using a test collection of the Japanese subtask, we confirmed the performance of our multi-label classification system.} } @article{07-NTCIR7-PATMN-GeyF, author={Fredric Gey and Ray Larson}, title={Patent Mining: A Baseline Approach}, year=2008, abstract={For NTCIR Workshop 7 UC Berkeley participated in both IR4QA and the Patent Mining Tasks. This paper summarizes our approach to Patent Mining. Our focus was upon the US Patent collection, and our methodology was to treat patent mining as an information retrieval task and to aggregate multiple patent classifications from retrieved patent documents. The performance was relatively poor, possibly because of retrieving too many documents, or because of non-utilization of blind feedback techniques.} } @article{08-NTCIR7-PATMN-JiD, author={Duo Ji and Huan-yu Zhao and Dong-feng Cai}, title={Using the Multi-level Classification Method in the Patent Mining Task at NTCIR-7}, year=2008, abstract={A patent includes a great amount of practical technical information, and is an important literature to push scientific progress. The research on its classification has significant application value. The patent is a special technique text with strict hierarchical classification system and normalized structure content, so that a great amount of association exists between components and patents. Based on these associations, this paper proposes a term weight calculation method using the patent title, and adopts a hierarchical classification method to complete the classification tasks of English and Japanese in the NTCIR-7 patent mining. The validity of the method is proved by corresponding evaluation.} } @article{09-NTCIR7-PATMN-MaseH, author={Hisao Mase and Makoto Iwayama}, title={NTCIR-7 Patent Mining Experiments at Hitachi}, year=2008, abstract={This paper reports results of our experiments on the automatic assignment of patent classification to research paper abstracts. 
We applied K-Nearest Neighbors Methods and three kinds of query term expansion methods using a research paper abstract dataset and a patent document dataset to improve the classification accuracy. The results show that these query expansion methods slightly improve classification accuracy when the parameter is tuned appropriately. We also compared the classification accuracy when research paper abstracts are used as input with that when abstracts or full texts of patent documents are used as input.} } @article{10-NTCIR-PATMN-NanbaH, author={Hidetsugu Nanba}, title={Hiroshima City University at NTCIR-7 Patent Mining Task}, year=2008, abstract={Our group participated in the Patent Mining Task of the NTCIR-7. We constructed three systems "HCU1", "HCU2", and "HCU3". "HCU1" is based on the k-Nearest Neighbour method using an IR system developed for NTCIR-6 Patent retrieval task. "HCU2" is the modified version of "HCU1" using analysis method of the structures of titles. "HCU3" uses automatically created lists of technical terms for each IPC code. We submitted these systems to the Japanese subtask, and obtained 39.13, 39.06, and 14.12 of the MAP scores, respectively.} } @article{11-NTCIR7-PATMN-SeddiquiM, author={Md. Hanif Seddiqui and Yohei Seki and Masaki Aono}, title={Ontology based Approach to Patent Mining for Relating International Patent Classification (IPC) to a Scientific Abstract}, year=2008, abstract={Identifying research gap and predicting research trend is a formidable task in the field of patent mining. The primary step to accomplish the task is to relate International Patent Classification (IPC) to a research paper abstract. Naively relating IPC to a scientific paper abstract is not an easy task due to the generality of terms available in an abstract, the massiveness of the patent documents and the availability of innovative new field specific technical terminologies. 
Our research proposes an efficient ontology approach to patent mining that retrieves IPC related to a scientific abstract by combining the data and the methodologies used in the field of ontology. The data contains an ontology of IPC and terms to IPC mapping. First, the system uses the extracted terms to retrieve probable IPCs from the terms to IPC mapping. We consider each of the probable IPCs as an anchor point in IPC ontology for further analysis. Our system starts aligning terms available in abstract to the hierarchy of the ontology of IPC to detect correct IPCs and to remove irrelevant one. Our system has a salient feature of efficient computation to relate IPC to scientific paper abstract. The way of using IPC ontology in retrieving related IPC is a novel process} } @article{12-NTCIR7-PATMN-ShimanoT, author={Takanori Shimano and Takashi Yukawa}, title={An Automated Research Paper Classification Method for the IPC system with the Concept Base}, year=2008, abstract={In the present paper, a classification method using the Concept Base is proposed and evaluated in the Patent Mining Task of the NTCIR-7 workshop. In this task, research papers are classified into the IPC system. The classification enables to locate the research papers on a patent map. To classify the paper, the authors take an approach that retrieves patent documents which are similar to the paper and classifies the paper with the class of the patent documents. The approach can classify the research paper into the correct classes when the appropriate patent documents are retrieved with the paper. However, there is an issue that patent documents differ from research papers in characteristic of document even if the technical idea of a patent is the same as a research papers. For example, the terms and document structure used in the patent differ from the paper. Therefore, the classification method in this task requires the approach that solve the issue caused by their document characteristic. 
To clarify the performance of the naive method of this approach, the authors classified research papers with a simple classification method using the VSM. Then to solve the issue caused by the term difference, a classification method using the CBVSM is proposed. The degree of similarity in this method reflects the semantics of the words. The results showed that the Mean Average precision value of the proposed method did not exceed that of the baseline method. In fact, our method is positioned in the worst group among the participating teams. However, the proposed method provides better average precision values than the baseline method for 33% of topics. From these results, it is suggested that a more effective method could be constructed with a combination of the baseline method and the method using CBVSM.} } @article{13-NTCIR7-PATMN-WangW, author={Wei Wang and Sujian Li and Chen Wang}, title={ICL at NTCIR-7: An Improved KNN Algorithm for Text Categorization}, year=2008, abstract={This paper describes our system for the NTCIR 07 Patent Mining Task which sought to make automatic text classification pragmatic. Our system employs an improved KNN algorithm which makes trade-off between effectiveness and time complexity. We have tried two distance metrics in our algorithm: cosine similarity and Euclid distance. Evaluation results on NTCIR07 test data show that the former one is slightly better.} } @article{01-NTCIR7-OV-PATMT-FujiiA, author={Atsushi Fujii and Masao Utiyama and Mikio Yamamoto and Takehito Utsuro}, title={Overview of the Patent Translation Task at the NTCIR-7 Workshop}, year=2008, abstract={To aid research and development in machine translation, we have produced a test collection for Japanese/English machine translation. To obtain a parallel corpus, we extracted patent documents for the same or related inventions published in Japan and the United States. 
Our test collection includes approximately 2 000 000 sentence pairs in Japanese and English, which were extracted automatically from our parallel corpus. These sentence pairs can be used to train and evaluate machine translation systems. Our test collection also includes search topics for cross-lingual patent retrieval, which can be used to evaluate the contribution of machine translation to retrieving patent documents across languages. This paper describes our test collection, methods for evaluating machine translation, and preliminary experiments. Our research is the first significant exploration into utilizing patent information for the evaluation of machine translations.} } @article{02-NTCIR7-PATMT-NakazawaT, author={Toshiaki Nakazawa and Sadao Kurohashi}, title={Kyoto-U: Syntactical EBMT System for NTCIR-7 Patent Translation Task}, year=2008, abstract={This paper describes "Kyoto-U" MT system that attended the patent translation task at NTCIR7. Example-based machine translation is applied in this system to integrate our study on both structural NLP and machine translation. In the alignment step, consistency criteria are applied to solve the alignment ambiguities and to discard incorrect alignment candidates. In the translation step, translation examples are combined using "bond" information, which can handle the word ordering without any statistics.} } @article{03-NTCIR7-PATMT-Katz-BrownJ, author={Jason Katz-Brown and Michael Collins}, title={Syntactic Reordering in Preprocessing for Japanese→English Translation: MIT System Description for NTCIR-7 Patent Translation Task}, year=2008, abstract={We experimented with a well-known technique of training a Japanese-to-English translation system on a Japanese training corpus that has been reordered into an English-like word order. We achieved surprisingly impressive results by naively reordering each Japanese sentence into reverse order. 
We also developed a reordering algorithm that transforms a Japanese dependency parse into English word order.} } @article{04-NTCIR7-PATMT-YasudaK, author={Keiji Yasuda and Andrew Finch and Hideo Okuma and Masao Utiyama and Hirofumi Yamamoto and Eiichiro Sumita}, title={System Description of NiCT-ATR SMT for NTCIR-7}, year=2008, abstract={In this paper we propose a method to improve SMT based patent translation. This method first employs International Patent Classification to build class based models. Then, multiple models are interpolated by weighting method employing source side language models. We carried out experiments using data from the patent translation cluster of NTCIR-7 workshop. According to the experimental results, the proposed method improved the most of automatic scores, which were NIST, WER and PER. Experimental results also show BLEU score degradation in the proposed method. However, statistical tests by bootstrapping do not show significance for the degradation.} } @article{05-NTCIR7-PATMT-WatanabeT, author={Taro Watanabe and Hajime Tsukada and Hideki Isozaki}, title={NTT SMT System 2008 at NTCIR-7}, year=2008, abstract={This paper describes NTT SMT System 2008 presented at the patent translation track (PAT-MT) in NTCIR-7. For PAT-MT, we submitted our strong baseline system faithfully following a hierarchical phrase-based statistical machine translation. The hierarchical phrase-based SMT is based on a synchronous-CFGs in which a paired source/target rules are synchronously applied starting from the initial symbol. The decoding is realized by a CYK-style bottom-up parsing on the source side with each derivation representing a translation candidate. 
We demonstrate the strong baseline for the PAT-MT English/Japanese translations.} } @article{06-NTCIR7-PATMT-ChuangZ, author={Ze-Jing Chuang and Yuen-Hsien Tseng}, title={NTCIR-7 Experiments in Patent Translation based on Open Source Statistical Machine Translation Tools}, year=2008, abstract={This paper describes our experiment methods and results in the Seventh Patent Translation Task[1]. As the first step of our research in machine translation, we integrated a series of open source software to build a statistical translation model. The experiment results demonstrated that we still need to improve the performance and efficiency in both model training and testing.} } @article{07-NTCIR7-PATMT-ItoT, author={Takeshi Ito and Tomoyosi Akiba and Katunobu Itou}, title={Effect of the Topic Dependent Translation Models for Patent Translation - Experiment at NTCIR-7}, year=2008, abstract={In this paper, we investigate the effect of the topic dependent translation models for patent translation. For training the topic dependent translation models, we first divide the parallel sentences into the topic dependent clusters, then the sentences in each cluster are used to train the translation model of the topic that they belong to. At the time of translation, the topic of the source sentence is predicted by applying a document retrieval method, then the sentence is translated by using the translation model that corresponds to the predicted topic.} } @article{08-NTCIR7-PATMT-IzuhaT, author={Tatsuya Izuha and Akira Kumano and Yuka Kuroda}, title={Toshiba Rule-Based Machine Translation System at NTCIR-7 PAT MT}, year=2008, abstract={Toshiba (tsbmt) participated in the Patent Translation Task at NTCIR-7. We submitted two runs for Japanese-English intrinsic evaluation, one run for English-Japanese intrinsic evaluation and one run for extrinsic evaluation. The machine translation system used for those runs is rule-based one developed for translating open-domain written texts. 
A technical term dictionary for patent domain is used for all the runs as well as the common word dictionary. In addition, one of the two runs for Japanese-English intrinsic evaluation uses a dictionary built semi-automatically from the training data. Although it is not fair to compare translation quality between our system and purely statistical ones since the former uses extra knowledge (hand-crafted dictionary entries and rules) for all the runs, we believe that we have contributed to the research community by providing useful data. This paper describes the overview of our machine translation system.} } @article{09-NTCIR7-PATMT-KomachiM, author={Mamoru Komachi and Masaaki Nagata and Yuji Matsumoto}, title={NAIST-NTT System Description for Patent Translation Task at NTCIR-7}, year=2008, abstract={This paper proposes a semi-supervised approach to acquire domain specific translation knowledge from the collection of Wikipedia. The proposed method starts from a small number of seed translation pairs for each domain in a given corpus, and applies the regularized Laplacian to learn translation pairs relevant to the domain. This paper presents evaluation results using the NTCIR-7 Patent Translation Task.} } @article{10-NTCIR7-PATMT-KumaiH, author={Hiroyuki Kumai and Hirohiko Sagawa and Yasutsugu Morimoto}, title={NTCIR-7 Patent Translation Experiments at Hitachi}, year=2008, abstract={Statistical Machine Translation (SMT) is a new paradigm in machine translation, which enables high-quality translation. However, many translation errors occur in the translation of complex and compound sentences because of the lack of grammatical knowledge about the global structure of a sentence. We adopt the pre-editing method, which divides sentences into clauses, and translate these clauses using the Moses SMT engine. The translation accuracy, BLEU, was 29.33%, so pre-editing has a small effect. 
Translation quality is degraded because the order of words is changed by not using information about other clauses. We also performed an experiment to confirm the optimum distortion-limit parameter of Moses. The Maximum BLEU was 29.45 for an English-Japanese patent translation when the distortion limit was 20 instead of -1.} } @article{11-NTCIR7-PATMT-LiJ, author={Jin-Ji Li and Hwi-Dong Na and Hankyong Kim and Chang-Hu Jin and Jong-Hyeok Lee}, title={The POSTECH Statistical Machine Translation Systems for NTCIR-7 Patent Translation Task}, year=2008, abstract={This paper describes the POSTECH statistical machine translation (SMT) systems for the NTCIR-7 patent translation task. We entered two patent translation subtasks: Japanese-to-English, and English-to-Japanese translation. The baseline systems are derived from a common phrase-based SMT framework. In addition, for Japanese-to-English translation, we adopted a word reordering model for preprocessing and a cluster based model based on syntactic information of Japanese sentences.} } @article{12-NTCIR7-PATMT-LiK, author={Kai Li and Yuejie Zhang}, title={Efficient Statistical Machine Translation Algorithm based on IBM Model 4}, year=2008, abstract={This paper describes our methodologies for NTCIR-7 Patent Translation involving English and Japanese, and reports the official results. Our system was novel combination of machine translation algorithms including classical statistical method IBM model and highly efficient decoding algorithm. The result of this new method is relatively decent but the speed of it is fast. 
It can be considered as a candidate for such situations as people want to get a quick and simple grasp of the main idea of a text.} } @article{13-NTCIR7-PATMT-MurakamiJ, author={Jin'ichi Murakami and Masato Tokuhisa and Satoru Ikehara}, title={Statistical Machine Translation with Long Phrase Table and without Long Parallel Sentences}, year=2008, abstract={In this study, we paid attention to the reliability of phrase table. To make phrase table, we have used Och's method[3]. And this method sometimes generates completely wrong phrase tables. We found that such phrase tables are caused by long parallel sentences. Therefore, we removed these long parallel sentences from training data. Also, we utilized general tools for statistical machine translation, such as "Giza++"[4], "moses"[5], and "training-phrase-model.perl"[6]. We obtained a BLEU score of 0.2229 of the Intrinsic-JE task and 0.2393 of the Intrinsic-EJ task for our proposed method. On the other hand, we obtained a BLEU score of 0.2162 of the Intrinsic-JE task and 0.2533 of the Intrinsic-EJ task for a standard method. This means that our proposed method was effective for the Intrinsic-JE task. However, it was not effective for the Intrinsic-EJ tasks. So, our system had average performance. For example, our system was the 20th place among 34 systems for Intrinsic-JE task and the 12th place among 20 systems for Intrinsic-EJ task.} } @article{14-NTCIR7-PATMT-OshioT, author={Tadaaki Oshio and Tomoharu Mitsuhashi and Tsuyoshi Kakita}, title={Use of the Technical Field-Oriented User Dictionaries}, year=2008, abstract={Japio performs various patent-related translation businesses, and owns the original patent-document-derived bilingual technical term database (Japio Terminology Database) to be served for the translators. Currently the database contains more than 780,000 J-E technical terms. To adapt the database to the NTCIR-7 Machine Translation Task, Japio compiled machine translation dictionaries from it. 
34 technical field-oriented dictionaries were created based on Japio's original technical fields. Terms are evaluated and selected according to their actual frequency in the bilingual patent document corpus of concerned technical field.} } @article{15-NTCIR7-PATMT-YamamotoM, author={Mikio Yamamoto and Jyunya Norimatsu and Mitsuru Koshikawa and Takahiro Fukutomi and Taku Nishio and Kugatsu Sadamitsu and Takehito Utsuro and Masao Utiyama and Shunji Umetani and Tomomi Matsui}, title={Diversion of Hierarchical Phrases as Reordering Templates}, year=2008, abstract={In the hierarchical phrase-based translation (Chiang 2007), translation rules handle both context-sensitive translation and reordering of phrases at the same time. This simultaneity is both a strength and a weakness of the model. Although it enables the rules to be applied to the accurate and correct context, it deteriorates the applicability of the rules. In other words, the rules work very well in domains of training data, but they lose robustness outside those domains. In this paper, we will try to improve the applicability of the original model by adding extra reordering rules which are separated out from hierarchical phrase translation rules. An original hierarchical phrase translation rule with two non-terminals is regarded as either monotone or swap reordering template according to if the two non-terminals in the source side have monotone or swap relation to the target side in the original rule. We will describe experiments in which the original model compares with our extension in BLEU as a metric of translation quality using shared data at the NTCIR-7 patent translation task.} } @article{16-NTCIR7-PATMT-ZhuJ, author={Junguo Zhu and Haoliang Qi and Muyun Yang and Jufeng Li and Sheng Li}, title={Patent SMT Based on Combined Phrases for NTCIR-7}, year=2008, abstract={In this paper, we describe a combined phrase approach to the Statistical Machine Translation of Japanese patents into English. 
To conquer the rich segmentation errors caused by the overwhelming OOV (out-of-vocabulary) word in the patent texts, the character based translation phrases are first extracted to avoid word segmentation noise. Then the word based translation phrases are established to utilize the dependable word level information. Finally the two translation phrases tables are linearly combined to capture both character and word level translation correspondences. Preliminary experiments on NTCIR7 corpus indicate that the BLEU scores of the proposed method significantly out-performed the habitual word based approach.} } @article{01-NTCIR7-OV-MuST-KatoT, author={Tsuneaki Kato and Mitsunori Matsushita}, title={Overview of MuST at the NTCIR-7 Workshop -- Challenges to Multi-modal Summarization for Trend Information}, year=2008, abstract={The Workshop on Multi-modal Summarization for Trend information (MuST) encourages and fosters researches on summarizing trends, which constitutes an important first step of researches on technologies for supporting interactive and explorative information utilization. While MuST in the previous cycles provided the participants with a framework for encouraging and fostering researches by sharing the same research resource and by a common theme in a broad sense, MuST at NTCIR-7 has had an aspect of an evaluation workshop by picking up some themes many participants have been tackling and organizing those into evaluation subtasks. These two aspects of the workshop have helped to activate and push forward the researches: one shows a wide range of possibilities and visions in this field, while the other confirms the state of accomplishment. In addition, shared resources have been developed through the workshop. 
These activities are encouraging researchers on multi-modal summarization of trend information.} } @article{01-NTCIR7-MuST-T-NanbaH, author={Hidetsugu Nanba}, title={Extraction of Trend Information from Newspaper Articles: Hiroshima City University at NTCIR-7 MuST}, year=2008, abstract={Trend information is a summarization of temporal statistical data, such as changes in product prices and sales. We propose a method for extracting trend information from multiple newspaper articles. Our group participated in the T2N Subtask at NTCIR-7 MuST (Multimodal Summarization for Trend Information). Our goal was to evaluate the effectiveness of our rule-based system using the data provided by NTCIR-7 MuST. From the experimental results, we obtained precision of 62.9% and recall of 7.0%.} } @article{02-NTCIR7-MuST-T-InoueT, author={Tatsukuni Inoue and Takashi Yamamoto and Makoto Toriyabe and Erina Shimizu and Hiroya Susuki and Hiroaki Saito}, title={Extraction of Chronological Statistics Using Domain Specific Knowledge}, year=2008, abstract={This paper reports a system which is constructed for our participation as a group of "keio01" of Keio University in the T2N (text to number) task at the MuST (Multimodal Summarization for Trend Information) workshop. The constructed system uses newspaper article corpora, task description and domain specific knowledge, and the system outputs chronological statistics. The statistics are ternary data which are pairs of a statistic name and pairs of a date and a value. They are available for visualization by drawing charts. The system was evaluated by precision and recall obtained through comparison to human extracted data, and it achieved 0.785 F-measure.} } @article{03-NTCIR7-MuST-T-MoriT, author={Tatsunori Mori and Rintaro Miyazaki}, title={A Simple Baseline Method for NTCIR-7 MuST T2N Task -Yokohama National University at NTCIR-7 MuST T2N-}, year=2008, abstract={We participated in the free task and the T2N task of NTCIR-7 MuST. 
In this paper, we will report our participation in T2N task. The system we prepared was a very simple and straightforward one. It will serve as a baseline for the T2N task. It consists of the following four modules: i) Element expression extractor, ii) Element expression combiner, iii) Date information canonicalizer, and iv) Selector of relevant statistical data. The main part of our system is the module (i) and a kind of chunk recognizer that is implemented in terms of a sequence labeling task for each character in the given text.} } @article{04-NTCIR7-MuST-T-UenishiY, author={Yasuhiro Uenishi and Fumito Masui and Tatsuaki Matsuba and Atsuo Kawai and Naoki Isu}, title={Trend Information Extraction Based on Relative Expression Participated on MuST T2N Subtask}, year=2008, abstract={This paper describes a system participating in the MuST T2N subtask. To the participating system, we applied the method of implicit trend information extraction utilizing relative expressions such as "0.1%増(grew 0.1%)", "前年(previous year)", "過去最高(maximum)". Relative differences and numerical changes in trend information can be signified by relative expressions. The system extracts elements of four types by pattern-based rules considering the relative expression. The extracted element is compared with the query word by identifying the synonym of the elements utilizing an EDR dictionary and some synonym databases. Some experiments were conducted with the MuST T2N formal run test collection. Although the results showed precision of 0.220 and recall of 0.029 totally, the outcomes of additional evaluations suggested the fundamental process performs effectively.} } @article{01-NTCIR7-MuST-F-KobayashiI, author={Ichiro Kobayashi and Naoko Okumura}, title={Text Generation for Explaining the Behavior of 2D Charts: With an Example of Stock Price Trends}, year=2008, abstract={We propose a method to generate a verbal report on the trends of stock price. 
The trends of the stock price are observed by the behavior of numerical data expressed in a 2D chart. Since its behavior reflects the shape of a chart, in order to recognize the behavior in qualitative and quantitative ways, we use least squares to mathematically recognize the shape and express it with words that often appear in news articles reporting the trends of stock prices. Our proposed method can change non-verbal information into verbal information --- this provides us with high accessibility and usability for various kinds of information.} } @article{02-NTCIR7-MuST-F-YoshidaM, author={Minoru Yoshida and Takahiro Sugiura and Takamasa Hirokawa and Kouichi Yamada and Hidetaka Masuda and Hiroshi Nakagawa}, title={TDU Systems for MuST: Attribute Name Extraction, Text-Based Stock Price Analysis, and Automatic Graph Generation}, year=2008, abstract={In this paper, we report our participation in MuST as the Tokyo Denki University team. We participated in MuST-F tasks with two systems: a system for attribute-name extraction and a system for analysis of relations between texts and stock prices. We also participated in the T2N task with a system for generating graphs from news articles automatically. We describe the algorithms of these three systems and discuss the results of the T2N task.} } @article{03-NTCIR7-MuST-F-TakamaY, author={Yasufumi Takama and Takashi Yamada}, title={Interactive Information Visualization of Trend Information}, year=2008, abstract={This paper proposes the visualization cube as a reference model for spatiotemporal trend information. The interactive visualization system for earthquake trend information was proposed at MuST in NTCIR 6 workshop, which defines earthquake trend information as spatiotemporal information and designs the interaction between different views so that temporal and spatial trend information can be effectively combined. 
By extending the functionality of the system, this paper proposes the concept of visualization cube, which defines the data structure of spatiotemporal trend information, type of views, and interactive operations for generating views. The interactive operations for generating various views include drill down/up, comparison, spin, and transition. The interactive information visualization system for spatiotemporal trend information is also developed based on the concept of visualization cube. The system was used in actual classes of an elementary school, of which the result shows the system has enough usability for 5th-grade elementary school children to perform exploratory data analysis. It is expected the visualization cube contributes to the improvement of the visualization platform that is developed by MuST workshop.} } @article{04-NTCIR7-MuST-F-ItohT, author={Takayuki Itoh and Haruho Tachibana}, title={Visualization of Corpus Data by a Dual Hierarchical Data Visualization Technique}, year=2008, abstract={The paper presents a technique for visualization of corpus data consisting of thousands of Japanese newspaper articles, and introduces several interesting trends discovered from the results. The technique first generates keyword-document matrices from the newspaper corpus, and respectively applies hierarchical clustering for rows and columns of the matrices. It then displays the two sets of clusters applying our own dual hierarchical data visualization technique. The visualization provides a mechanism to interact the two visualization components each other, so that users can freely explore the detail of the corpus data. This paper first describes the algorithm of the dual hierarchical data visualization technique, and then introduces our implementation and experiments of the visualization of the newspaper corpus data. 
} } @article{05-NTCIR7-MuST-F-IwataK, author={Kenichi Iwata and Mariko Sasakura}, title={A Method to Visualize Numerical Data with Geographical Information using Feathered Circles Painted by Color Gradation}, year=2008, abstract={This paper presents a method to visualize data with geographical information. Some kind of data has its value and subsidiary data of geographical information. A typical example of that kind of data is a value with location information where the value is measured. It helps viewers understanding them to show both of the values and geographical information at the same time. The method we propose plots data on a map using circles painted by color gradation of which edges are feathered. We also show algorithm and implementation of the method. The result of plotting data on a map shown in this paper is easy to understand.} } @article{06-NTCIR7-MuST-F-KawaiH, author={Hideki Kawai and Kazuo Kunieda and Keiji Yamada and Haruka Saito and Masaaki Tsuchida and Hironori Mizuguchi}, title={Visualization for Statistical Term Network in Newspaper}, year=2008, abstract={In this paper, we propose a visualization method for global dynamics. Global dynamics of various events and statistics are important to analyze complex international issues such as environmental, economic and political problems. We have been developing a system which can extract a co-occurrence network of statistical terms based on a suffix pattern matching of statistical terms. However, the network structure consisting of thousands of statistical terms is too complicated to understand their causal relations briefly. So we propose a method for simplifying the network structure based on network complexity and language expressions. 
Our experimental result shows that a clique of the statistical terms corresponds to a certain topic or issue and causal relations can be described as a chain of the cliques on the network structure.} } @article{07-NTCIR7-MuST-F-MurataM, author={Masaki Murata and Tamotsu Shirado and Kentaro Torisawa and Masakazu Iwatate and Koji Ichii and Qing Ma and Toshiyuki Kanamaru}, title={Sophisticated Text Mining System for Extracting and Visualizing Numerical and Named Entity Information from a Large Number of Documents}, year=2008, abstract={We have developed a system that can semiautomatically extract numerical and named entity sets from a large number of Japanese documents and can create various kinds of tables and graphs. In our experiments, our system semiautomatically created approximately 300 kinds of graphs and tables at precisions of 0.2--0.8 with only 2 h of manual preparation from a 2-year stack of newspaper articles. Note that these newspaper articles contained a large quantity of data, and all of them could not be read or checked manually in such a short amount of time. From this perspective, we concluded that our system is useful and convenient for extracting information from a large number of documents. We have constructed a demonstration system. In this paper, we briefly describe the demonstration system.} } @article{08-NTCIR7-MuST-F-RzepkaR, author={Rafal Rzepka and Masafumi Matsuhara and Yasutomo Kimura and Keiichi Takamaru and Hideyuki Shibuki and Koji Murakami}, title={Toward Automatic Support For Japanese Lay Judge System - Processing Precedent Factors For Sentencing Trends Discovery}, year=2008, abstract={In this paper we investigate factors that influence trends in the sentencing process based on newspaper articles in order to find lexical clues for automatic trial verdict estimation. We examine verdicts for murder cases of the last 10 years using 149 newspaper articles (1998-2001) and 160 precedents from a database available online (2001-2007). 
The results showed that there is a tendency to give stricter verdicts for murder cases and clustered by CLUTO into five classes were judged as descriptive factors related to a crime method and lethal weapon usage. We also use similarity between cases for sentence estimation after filtering out inadequate factors by using previously retrieved precedents. Finally we confirmed that using similarity-based weights gives less erroneous sentence estimation than the baseline method lacking those weights.} } @article{09-NTCIR7-MuST-F-UmanoM, author={Motohide Umano and Naoyuki Koizumi}, title={Verbal Expression of Time Series with Global Trend and Local Features}, year=2008, abstract={We have many kinds of data of time series such as stock prices. We understand them via their verbal expression in a natural language rather than conventional stochastic models. We propose a method to express a global trend and local features of time series in a natural language. A global trend is extracted via representative values, e.g. weighted averages, on the fuzzy intervals in the temporal axis and local features are specified as the position of large difference between the original data and the data generated from the global trend with fuzzy rule expressions. We apply the method to the data of Workshop on Multimodal Summarization for Trend Information (MuST). } }