1 / 12

SlothLib Web

2. SlothLib??????????. ????Visual Studio 2005Visual [ C

chelsea
Download Presentation

SlothLib Web

An image/link below is provided (as is) to download the presentation. Download Policy: Content on the Website is provided to you AS IS for your information and personal use and may not be sold, licensed, or shared on other websites without the consent of its author. Content is provided to you AS IS for your information and personal use only. Download the presentation by clicking this link. If for some reason you are unable to download a presentation, the publisher may have deleted the file from their server.

E N D

Presentation Transcript


    1. 1 SlothLib????????? ?Web??????????????????? ????? ??????????Web????? ?????URL?Web?????????? ????????? ???????? ??????????? TF-IDF ?????????

    2. 2 SlothLib?????????? ???? Visual Studio 2005 Visual [ C# | Basic | C++ ] 2005 Express Edition ??? SlothLib??????????? SlothLib???(dll?????) SlothLib???????? http://www.dl.kuis.kyoto-u.ac.jp/slothlib/ ????????????????????????? ??????? SlothLib????????????????????

    3. 3 ????????????? ????????? MeCab http://sourceforge.net/project/showfiles.php?group_id=177856 ?mecab-0.95.exe???????????????? ????????????0.96????0.95 ??????????? xdoc2txt http://www31.ocn.ne.jp/~h_ishida/xdoc2txt.html xdoc2txt 1.26 ( d2txt126.zip /101KB ) ??????? ??????????????????? ?) C:\usr\bin\xdoc2txt\

    4. 4 Web?? // ??????Yahoo!????????????ID? slothlib ISearch search = new YahooJpWebSearch("slothlib"); // ????????????????20??? ISearchResult sr = search.DoSearch("??", 20); // URL????????? List<string> urlList = new List<string>(); // ?????1????? foreach (ISearchResultElement el in sr.ResultElements) { // URL????????? urlList.Add(el.URL); }

    5. 5 Web?????????? // ????????Web??????????????? MultiWebGet multiWebGet = new MultiWebGet(); // ?????content-type??? // ?????HTML?PDF?WORD?????? string[] targetTypes = { "text/html", "application/pdf", "application/msword" }; // ????????Web?????????? MultiWebGetResults wgResults = multiWebGet.DoFetch(urlList.ToArray(), targetTypes, null); // content-type???????????? wgResults.ChangeExtension();

    6. 6 ??????????? // ??????????????????? List<string> contentList = new List<string>(); // ??????????????????? MultiReader multiReader = new MultiReader(@"C:\usr\bin\xdoc2txt\xdoc2txt.exe"); // ???????????????????? foreach (WebGetResult wgr in wgResults) { // ????????????? try { string content = multiReader.Read(wgr.LocalFilePath); contentList.Add(content.Trim()); } catch { continue; } }

    7. 7 ????????????? // MeCab????????????????????? MeCab mecab = new MeCab(); // ?????????????????????? IMorphemeFilter posFil = new PosFilter("??", ".*???"); // ???????????????? IMorphemeToStringFilter orgnFil = new RemainOriginalFilter(); // ???????????????? StopWordFilter swFil = new StopWordFilter(); // ??????????????? swFil.LoadWordList(@"C:\StopWord\word\"); swFil.LoadSymbolList(@"C:\StopWord\symbol\");

    8. 8 ??????(TF)????? // ???????????List List<IVector<string>> tfVectors = new List<IVector<string>>(); foreach (string c in contentList) { MeCabResult meResult = mecab.DoAnalyze(c); // MeCab??? IMorpheme[] onlyNoun = posFil.DoFilter(meResult.Morphemes); string[] strArray; strArray = orgnFil.DoFilter(onlyNoun); // ???????? strArray = swFil.DoFilter(strArray); // ?????????? if (strArray.Length > 0) { // ??????????????????? IVector<string> v = new FrequencyVector<string>(strArray); tfVectors.Add(v); } }

    9. 9 ??????(TF-IDF)????? // Document Frequency ???? DocumentFrequencyVector<string> df = new DocumentFrequencyVector<string>(tfVectors); // IDF?????? IVector<string> idf = new InverseDFVector<string>(df); List<IVector<string>> tfidfVectors = new List<IVector<string>>(); foreach (IVector<string> tf in tfVectors) { IVector<string> tfidf = new MultiplyVector<string>(tf, idf); tfidfVectors.Add(tfidf); }

    10. 10 ?????????? // ????????????????????? // ???????????????? IHierarchicalClustering<string> cmp = new CompleteLinkageClustering<string> (new CosineCalculator<string>()); // ?????????????=??????? HierarchicalClusteringResult<string> clusters = cmp.DoClustering(tfidfVectors.ToArray()); // ???0.05???????????????????? ClusteringResult<string> r = clusters.GetClusteringResult(0.05);

    11. 11 ???? // ?????????? for (int i = 0; i < r.ClusterCount; i++) { Console.Write("[" + i.ToString() + "]"); // ????????? // ??????????????????? int[] idxs = r.Clusters[i].GetIndices(); for (int j = 0; j < idxs.Length; j++) { Console.Write(", " + idxs[j].ToString()); } Console.Write("\r\n "); // ??????????????????10???? IVector<string>[] vs = r.Clusters[i].GetVectors(); // ???????????????? IVector<string> addv = new AddVector<string>(vs); // ??????????????? string[] sortedTerms = addv.GetSortedKeyList(); // ???????????????? for (int j = 0; j < sortedTerms.Length && j < 10; j++) { Console.Write(" " + sortedTerms[j]); } Console.WriteLine(); }

    12. 12 ??? [0], 0, 4, 8 ?? ?? ?? ?? ?? ?? ?? ??? ?? ?? [1], 1, 2, 3, 13 ?? ?? ?? ?? ? ??? ?? ????? ???? ??? [2], 5, 11, 14 ??? ????? ?? ?? ?? ???? ?? ??? ???? ?? [3], 6, 7, 12 KBS ?? ?? ?? ?? ?? ?? ?? ?? ?? [4], 9 ??? ?? ?? ? ? ?? ???????? ?? ??? ?? [5], 10 Stay Access Wedding Restaurant Event News ENGLISH Party TEL [6], 15, 17 px globalNavigation ?? wrapper margin container ffffff width background padding [7], 16 ??? ??? ? ?? ?? ? ?? ?? ?? ??

More Related