E N D
1. 1 SlothLib??????????Web??????????????????? ?????
??????????Web?????
?????URL?Web??????????
?????????
????????
???????????
TF-IDF
?????????
2. 2 SlothLib?????????? ????
Visual Studio 2005
Visual [ C# | Basic | C++ ] 2005 Express Edition ???
SlothLib???????????
SlothLib???(dll?????)
SlothLib????????
http://www.dl.kuis.kyoto-u.ac.jp/slothlib/
?????????????????????????
???????
SlothLib????????????????????
3. 3 ????????????? ?????????
MeCab
http://sourceforge.net/project/showfiles.php?group_id=177856
?mecab-0.95.exe????????????????
????????????0.96????0.95
???????????
xdoc2txt
http://www31.ocn.ne.jp/~h_ishida/xdoc2txt.html
xdoc2txt 1.26 ( d2txt126.zip /101KB ) ???????
???????????????????
?) C:\usr\bin\xdoc2txt\
4. 4 Web?? // ??????Yahoo!????????????ID? slothlib
// Collects the URL of every search hit.
List<string> urlList = new List<string>();
// Search client. "slothlib" is the constructor argument
// (presumably the Yahoo! application ID -- confirm).
ISearch search = new YahooJpWebSearch("slothlib");
// Run the query; 20 is presumably the number of results requested -- confirm.
// NOTE(review): the query literal looks like non-ASCII text that was lost in
// extraction; restore the intended query before running.
ISearchResult sr = search.DoSearch("??", 20);
// Copy the URL of each result element into the list.
foreach (ISearchResultElement hit in sr.ResultElements)
{
    urlList.Add(hit.URL);
}
5. 5 Web?????????? // ????????Web???????????????
// Content types passed to the fetcher: HTML, PDF and MS Word.
string[] targetTypes = new string[]
{
    "text/html",
    "application/pdf",
    "application/msword"
};
// Fetcher used to retrieve all collected URLs in one call.
MultiWebGet multiWebGet = new MultiWebGet();
// Fetch every URL in urlList with the content types above; the third argument is left null.
MultiWebGetResults wgResults =
    multiWebGet.DoFetch(urlList.ToArray(), targetTypes, null);
// Presumably renames the downloaded files so their extension matches the
// content type -- confirm against the SlothLib documentation.
wgResults.ChangeExtension();
6. 6 ??????????? // ???????????????????
// Text of every downloaded document that could be read.
List<string> contentList = new List<string>();
// Reader constructed with the path of the external xdoc2txt executable.
MultiReader multiReader =
    new MultiReader(@"C:\usr\bin\xdoc2txt\xdoc2txt.exe");
// Read each downloaded file and keep its trimmed text.
foreach (WebGetResult wgr in wgResults)
{
    try
    {
        string content = multiReader.Read(wgr.LocalFilePath);
        contentList.Add(content.Trim());
    }
    catch (Exception ex)
    {
        // Best effort: a file that cannot be read is skipped, but the failure is
        // reported on stderr instead of being swallowed silently.
        // Skipped files add nothing to contentList, so positions in contentList
        // need not line up with positions in wgResults.
        Console.Error.WriteLine("Skipped " + wgr.LocalFilePath + ": " + ex.Message);
    }
}
7. 7 ????????????? // MeCab?????????????????????
MeCab mecab = new MeCab();
// Part-of-speech filter built from two pattern strings.
// NOTE(review): both literals contain '?' runs that look like non-ASCII text
// lost in extraction; restore the intended patterns before running.
IMorphemeFilter posFil = new PosFilter("??", ".*???");
// Morpheme-to-string filter (presumably yields each morpheme's original form -- confirm).
IMorphemeToStringFilter orgnFil = new RemainOriginalFilter();
// Stop-word filter, initially without any lists loaded.
StopWordFilter swFil = new StopWordFilter();
// Load the stop-word and symbol lists from the two paths below.
swFil.LoadWordList(@"C:\StopWord\word\");
swFil.LoadSymbolList(@"C:\StopWord\symbol\");
8. 8 ??????(TF)????? // ???????????List
// One frequency vector per document that still has terms after filtering.
List<IVector<string>> tfVectors = new List<IVector<string>>();
foreach (string text in contentList)
{
    // Analyze the text with MeCab, then apply the part-of-speech filter.
    MeCabResult analysis = mecab.DoAnalyze(text);
    IMorpheme[] kept = posFil.DoFilter(analysis.Morphemes);

    // Convert the morphemes to strings, then apply the stop-word filter.
    string[] asStrings = orgnFil.DoFilter(kept);
    string[] terms = swFil.DoFilter(asStrings);

    // Documents with no remaining terms contribute no vector.
    if (terms.Length <= 0)
    {
        continue;
    }

    // Build the vector from the remaining terms and store it.
    IVector<string> frequencies = new FrequencyVector<string>(terms);
    tfVectors.Add(frequencies);
}
9. 9 ??????(TF-IDF)????? // Document Frequency ????
// Document-frequency vector built from all TF vectors.
DocumentFrequencyVector<string> df = new DocumentFrequencyVector<string>(tfVectors);
// Inverse document-frequency vector built from df.
IVector<string> idf = new InverseDFVector<string>(df);
// One TF-IDF vector per TF vector, in the same order.
List<IVector<string>> tfidfVectors = new List<IVector<string>>();
for (int k = 0; k < tfVectors.Count; k++)
{
    // Combine the document's TF vector with the shared IDF vector.
    tfidfVectors.Add(new MultiplyVector<string>(tfVectors[k], idf));
}
10. 10 ?????????? // ?????????????????????
// Clusterer: CompleteLinkageClustering constructed with a CosineCalculator.
IHierarchicalClustering<string> cmp = new CompleteLinkageClustering<string> (new CosineCalculator<string>());
// Cluster all TF-IDF vectors.
HierarchicalClusteringResult<string> clusters = cmp.DoClustering(tfidfVectors.ToArray());
// Obtain a flat clustering by passing 0.05 to GetClusteringResult.
// NOTE(review): 0.05 is presumably a similarity/distance threshold at which the
// hierarchy is cut -- confirm its meaning against the SlothLib documentation.
ClusteringResult<string> r = clusters.GetClusteringResult(0.05);
11. 11 ???? // ??????????
// For every cluster print its number, the indices of its members, and up to ten terms.
for (int i = 0; i < r.ClusterCount; i++)
{
    // Cluster label such as "[0]". The bracket characters must be quoted string
    // literals; without the quotes this statement does not compile.
    Console.Write("[" + i.ToString() + "]");

    // Indices of the vectors that belong to this cluster, printed as ", n".
    int[] idxs = r.Clusters[i].GetIndices();
    for (int j = 0; j < idxs.Length; j++)
    {
        Console.Write(", " + idxs[j].ToString());
    }
    Console.Write("\r\n ");

    // Combine the cluster's vectors into one (AddVector) and print at most the
    // first ten keys of its sorted key list.
    // NOTE(review): the sort order of GetSortedKeyList is not visible here;
    // presumably highest weight first -- confirm.
    IVector<string>[] vs = r.Clusters[i].GetVectors();
    IVector<string> addv = new AddVector<string>(vs);
    string[] sortedTerms = addv.GetSortedKeyList();
    for (int j = 0; j < sortedTerms.Length && j < 10; j++)
    {
        Console.Write(" " + sortedTerms[j]);
    }
    Console.WriteLine();
}
12. 12 ??? [0], 0, 4, 8
?? ?? ?? ?? ?? ?? ?? ??? ?? ??
[1], 1, 2, 3, 13
?? ?? ?? ?? ? ??? ?? ????? ???? ???
[2], 5, 11, 14
??? ????? ?? ?? ?? ???? ?? ??? ???? ??
[3], 6, 7, 12
KBS ?? ?? ?? ?? ?? ?? ?? ?? ??
[4], 9
??? ?? ?? ? ? ?? ???????? ?? ??? ??
[5], 10
Stay Access Wedding Restaurant Event News ENGLISH Party TEL
[6], 15, 17
px globalNavigation ?? wrapper margin container ffffff width background padding
[7], 16
??? ??? ? ?? ?? ? ?? ?? ?? ??