Java过滤停用词源码

整理文档很辛苦,赏杯茶钱您下走!

免费阅读已结束,点击下载阅读编辑剩下 ...

阅读已结束,您可以下载文档离线阅读编辑

资源描述

package SimilarityCompution;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.Set;

import ICTCLAS.I3S.AC.ICTCLAS50;

/**
 * Segments a text file with ICTCLAS50 and writes it back out with every
 * stop word removed.
 *
 * <p>Each input line is segmented, the resulting words are checked against the
 * stop-word table, and the surviving words are written to the destination
 * file as one space-terminated sequence per input line.
 */
public class FileExcludeStopWord {

    /** Path of the stop-word table; one stop word per line. */
    public static final String stopWordTable =
            "." + File.separator + "srcFile" + File.separator + "StopWordTable.txt";

    /** Charset name used for every byte exchange with the ICTCLAS50 segmenter. */
    private static final String SEGMENTER_CHARSET = "gb2312";

    /**
     * Runs the filter on the bundled sample document.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Source and destination files.
        String srcFile = "." + File.separator + "srcFile" + File.separator + "如何正确的使用化妆品效.txt";
        String destFile = "." + File.separator + "destFile" + File.separator + "如何正确的使用化妆品效.txt";
        new FileExcludeStopWord().fileExcludeStopWord(srcFile, destFile);
    }

    /**
     * Reads {@code srcFile} line by line, segments each line, drops every word
     * found in {@link #stopWordTable} and writes the remaining words to
     * {@code destFile}.
     *
     * <p>Failures are reported on the console (stack trace, or a message when
     * the segmenter cannot be initialised) and are not propagated to the
     * caller. All streams are closed on every exit path.
     *
     * @param srcFile  path of the text file to filter
     * @param destFile path of the file that receives the filtered text
     */
    public void fileExcludeStopWord(String srcFile, String destFile) {
        BufferedReader srcFileBr = null;
        BufferedWriter destFileBw = null;
        try {
            Set<String> stopWordSet = loadStopWords(stopWordTable);

            // Segmentation tool; "." is handed to the native library as its data path.
            ICTCLAS50 ictclas = new ICTCLAS50();
            String argu = ".";
            if (!ictclas.ICTCLAS_Init(argu.getBytes(SEGMENTER_CHARSET))) {
                System.out.println("分词所用库初始化失败。");
                return;
            }

            // Open the output only after initialisation succeeded, so a failed
            // run does not create or truncate the destination file.
            // NOTE(review): both files are read/written with the platform default
            // charset, as before — confirm that matches the actual file encoding.
            srcFileBr = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(srcFile))));
            destFileBw = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(new File(destFile))));

            String paragraph;
            while ((paragraph = srcFileBr.readLine()) != null) {
                // Segment the line. The trailing numeric arguments are passed through
                // unchanged; presumably an encoding code and a POS-tagging flag —
                // confirm against the ICTCLAS50 documentation.
                byte[] segmented = ictclas.ICTCLAS_ParagraphProcess(
                        paragraph.getBytes(SEGMENTER_CHARSET), 2, 0);
                String segmentedText =
                        new String(segmented, 0, segmented.length, SEGMENTER_CHARSET);

                // Keep every word that is not a stop word, each followed by a space.
                StringBuilder kept = new StringBuilder();
                for (String word : segmentedText.split(" ")) {
                    if (!stopWordSet.contains(word)) {
                        kept.append(word).append(" ");
                    }
                }

                destFileBw.write(kept.toString());
                destFileBw.newLine();
            }

            // Close the writer on the success path so that a failed flush is reported
            // by the catch blocks below rather than being silently dropped.
            destFileBw.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(destFileBw);
            closeQuietly(srcFileBr);
        }
    }

    /** Reads the stop-word table at {@code path} into a set, one entry per line. */
    private static Set<String> loadStopWords(String path) throws IOException {
        Set<String> stopWords = new HashSet<String>();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(path))));
        try {
            String stopWord;
            while ((stopWord = reader.readLine()) != null) {
                stopWords.add(stopWord);
            }
        } finally {
            closeQuietly(reader);
        }
        return stopWords;
    }

    /** Closes {@code resource} if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Cleanup path only: there is nothing useful left to do with the stream.
        }
    }
}

1 / 3
下载文档,编辑使用

©2015-2020 m.777doc.com 三七文档.

备案号:鲁ICP备2024069028号-1 客服联系 QQ:2149211541

×
保存成功