TinySegmenterのphpを改造してみる。
TinySegmenter っていうjavascriptで実装されている分かち書きソフトがある。
これの php 版があるんだけど、文字の切り出しに mb_strsub を使っているのと、@でエラー抑制をしていたので少し書き換えてみた。
ついでに、私はUTF-8しか使わない予定なので、専用に改悪してしまったけど、こんな感じになった。
<?php /* PHP Version of TinySegmenter (http://chasen.org/~taku/software/TinySegmenter/) TinySegmenter is super compact Japanese tokenizer. TinySegmenter was originally developed by Taku Kudo <taku(at)chasen.org>. PHP Version was developed by xnights <programming.magic(at)gmail.com>. For details, see http://programming-magic.com/?id=172 */ class TinySegmenterarray{ private $patterns_ = array( "[一二三四五六七八九十百千万億兆]"=>"M", "[一-龠々〆ヵヶ]"=>"H", "[ぁ-ん]"=>"I", "[ァ-ヴーア-ン゙ー]"=>"K", "[a-zA-Za-zA-Z]"=>"A", "[0-90-9]"=>"N", ); private $BIAS__ = -332; private $BC1__ = array("HH"=>6,"II"=>2461,"KH"=>406,"OH"=>-1378); private $BC2__ = array("AA"=>-3267,"AI"=>2744,"AN"=>-878,"HH"=>-4070,"HM"=>-1711,"HN"=>4012,"HO"=>3761,"IA"=>1327,"IH"=>-1184,"II"=>-1332,"IK"=>1721,"IO"=>5492,"KI"=>3831,"KK"=>-8741,"MH"=>-3132,"MK"=>3334,"OO"=>-2920); private $BC3__ = array("HH"=>996,"HI"=>626,"HK"=>-721,"HN"=>-1307,"HO"=>-836,"IH"=>-301,"KK"=>2762,"MK"=>1079,"MM"=>4034,"OA"=>-1652,"OH"=>266); private $BP1__ = array("BB"=>295,"OB"=>304,"OO"=>-125,"UB"=>352); private $BP2__ = array("BO"=>60,"OO"=>-1762); private $BQ1__ = array("BHH"=>1150,"BHM"=>1521,"BII"=>-1158,"BIM"=>886,"BMH"=>1208,"BNH"=>449,"BOH"=>-91,"BOO"=>-2597,"OHI"=>451,"OIH"=>-296,"OKA"=>1851,"OKH"=>-1020,"OKK"=>904,"OOO"=>2965); private $BQ2__ = array("BHH"=>118,"BHI"=>-1159,"BHM"=>466,"BIH"=>-919,"BKK"=>-1720,"BKO"=>864,"OHH"=>-1139,"OHM"=>-181,"OIH"=>153,"UHI"=>-1146); private $BQ3__ = array("BHH"=>-792,"BHI"=>2664,"BII"=>-299,"BKI"=>419,"BMH"=>937,"BMM"=>8335,"BNN"=>998,"BOH"=>775,"OHH"=>2174,"OHM"=>439,"OII"=>280,"OKH"=>1798,"OKI"=>-793,"OKO"=>-2242,"OMH"=>-2402,"OOO"=>11699); private $BQ4__ = array("BHH"=>-3895,"BIH"=>3761,"BII"=>-4654,"BIK"=>1348,"BKK"=>-1806,"BMI"=>-3385,"BOO"=>-12396,"OAH"=>926,"OHH"=>266,"OHK"=>-2036,"ONN"=>-973); private $BW1__ = array(",と"=>660,",同"=>727,"B1あ"=>1404,"B1同"=>542,"、と"=>660,"、同"=>727,"」と"=>1682,"あっ"=>1505,"いう"=>1743,"いっ"=>-2055,"いる"=>672,"うし"=>-4817,"うん"=>665,"から"=>3472,"がら"=>600,"こう"=>-790,"こと"=>2083,"こん"=>-1262,"さら"=>-4143,"さん"=>4573,"した"=>2641,"して"=>1104,"すで"=>-3399,"そこ"=>1977,"それ"=>-871,"たち"=>1122,"ため"=>601,"った"=>3463,"つい"=>-802,"てい"=>805,"てき"=>1249,"でき"=>1127,"です"=>3445,"では"=>844,"とい"=>-4915,"とみ"=>1922,"どこ"=>3887,"ない"=>5713,"なっ"=>3015,"など"=>7379,"なん"=>-1113,"にし"=>2468,"には"=>1498,"にも"=>1671,"に対"=>-912,"の一"=>-501,"の中"=>741,"ませ"=>2448,"まで"=>1711,"まま"=>2600,"まる"=>-2155,"やむ"=>-1947,"よっ"=>-2565,"れた"=>2369,"れで"=>-913,"をし"=>1860,"を見"=>731,"亡く"=>-1886,"京都"=>2558,"取り"=>-2784,"大き"=>-2604,"大阪"=>1497,"平方"=>-2314,"引き"=>-1336,"日本"=>-195,"本当"=>-2423,"毎日"=>-2113,"目指"=>-724,"B1あ"=>1404,"B1同"=>542,"」と"=>1682); private $BW2__ = array(".."=>-11822,"11"=>-669,"――"=>-5730,"??"=>-13175,"いう"=>-1609,"うか"=>2490,"かし"=>-1350,"かも"=>-602,"から"=>-7194,"かれ"=>4612,"がい"=>853,"がら"=>-3198,"きた"=>1941,"くな"=>-1597,"こと"=>-8392,"この"=>-4193,"させ"=>4533,"され"=>13168,"さん"=>-3977,"しい"=>-1819,"しか"=>-545,"した"=>5078,"して"=>972,"しな"=>939,"その"=>-3744,"たい"=>-1253,"たた"=>-662,"ただ"=>-3857,"たち"=>-786,"たと"=>1224,"たは"=>-939,"った"=>4589,"って"=>1647,"っと"=>-2094,"てい"=>6144,"てき"=>3640,"てく"=>2551,"ては"=>-3110,"ても"=>-3065,"でい"=>2666,"でき"=>-1528,"でし"=>-3828,"です"=>-4761,"でも"=>-4203,"とい"=>1890,"とこ"=>-1746,"とと"=>-2279,"との"=>720,"とみ"=>5168,"とも"=>-3941,"ない"=>-2488,"なが"=>-1313,"など"=>-6509,"なの"=>2614,"なん"=>3099,"にお"=>-1615,"にし"=>2748,"にな"=>2454,"によ"=>-7236,"に対"=>-14943,"に従"=>-4688,"に関"=>-11388,"のか"=>2093,"ので"=>-7059,"のに"=>-6041,"のの"=>-6125,"はい"=>1073,"はが"=>-1033,"はず"=>-2532,"ばれ"=>1813,"まし"=>-1316,"まで"=>-6621,"まれ"=>5409,"めて"=>-3153,"もい"=>2230,"もの"=>-10713,"らか"=>-944,"らし"=>-1611,"らに"=>-1897,"りし"=>651,"りま"=>1620,"れた"=>4270,"れて"=>849,"れば"=>4114,"ろう"=>6067,"われ"=>7901,"を通"=>-11877,"んだ"=>728,"んな"=>-4115,"一人"=>602,"一方"=>-1375,"一日"=>970,"一部"=>-1051,"上が"=>-4479,"会社"=>-1116,"出て"=>2163,"分の"=>-7758,"同党"=>970,"同日"=>-913,"大阪"=>-2471,"委員"=>-1250,"少な"=>-1050,"年度"=>-8669,"年間"=>-1626,"府県"=>-2363,"手権"=>-1982,"新聞"=>-4066,"日新"=>-722,"日本"=>-7068,"日米"=>3372,"曜日"=>-601,"朝鮮"=>-2355,"本人"=>-2697,"東京"=>-1543,"然と"=>-1384,"社会"=>-1276,"立て"=>-990,"第に"=>-1612,"米国"=>-4268,"11"=>-669); private $BW3__ = array("あた"=>-2194,"あり"=>719,"ある"=>3846,"い."=>-1185,"い。"=>-1185,"いい"=>5308,"いえ"=>2079,"いく"=>3029,"いた"=>2056,"いっ"=>1883,"いる"=>5600,"いわ"=>1527,"うち"=>1117,"うと"=>4798,"えと"=>1454,"か."=>2857,"か。"=>2857,"かけ"=>-743,"かっ"=>-4098,"かに"=>-669,"から"=>6520,"かり"=>-2670,"が,"=>1816,"が、"=>1816,"がき"=>-4855,"がけ"=>-1127,"がっ"=>-913,"がら"=>-4977,"がり"=>-2064,"きた"=>1645,"けど"=>1374,"こと"=>7397,"この"=>1542,"ころ"=>-2757,"さい"=>-714,"さを"=>976,"し,"=>1557,"し、"=>1557,"しい"=>-3714,"した"=>3562,"して"=>1449,"しな"=>2608,"しま"=>1200,"す."=>-1310,"す。"=>-1310,"する"=>6521,"ず,"=>3426,"ず、"=>3426,"ずに"=>841,"そう"=>428,"た."=>8875,"た。"=>8875,"たい"=>-594,"たの"=>812,"たり"=>-1183,"たる"=>-853,"だ."=>4098,"だ。"=>4098,"だっ"=>1004,"った"=>-4748,"って"=>300,"てい"=>6240,"てお"=>855,"ても"=>302,"です"=>1437,"でに"=>-1482,"では"=>2295,"とう"=>-1387,"とし"=>2266,"との"=>541,"とも"=>-3543,"どう"=>4664,"ない"=>1796,"なく"=>-903,"など"=>2135,"に,"=>-1021,"に、"=>-1021,"にし"=>1771,"にな"=>1906,"には"=>2644,"の,"=>-724,"の、"=>-724,"の子"=>-1000,"は,"=>1337,"は、"=>1337,"べき"=>2181,"まし"=>1113,"ます"=>6943,"まっ"=>-1549,"まで"=>6154,"まれ"=>-793,"らし"=>1479,"られ"=>6820,"るる"=>3818,"れ,"=>854,"れ、"=>854,"れた"=>1850,"れて"=>1375,"れば"=>-3246,"れる"=>1091,"われ"=>-605,"んだ"=>606,"んで"=>798,"カ月"=>990,"会議"=>860,"入り"=>1232,"大会"=>2217,"始め"=>1681,"市"=>965,"新聞"=>-5055,"日,"=>974,"日、"=>974,"社会"=>2024,"カ月"=>990); private $TC1__ = array("AAA"=>1093,"HHH"=>1029,"HHM"=>580,"HII"=>998,"HOH"=>-390,"HOM"=>-331,"IHI"=>1169,"IOH"=>-142,"IOI"=>-1015,"IOM"=>467,"MMH"=>187,"OOI"=>-1832); private $TC2__ = array("HHO"=>2088,"HII"=>-1023,"HMM"=>-1154,"IHI"=>-1965,"KKH"=>703,"OII"=>-2649); private $TC3__ = array("AAA"=>-294,"HHH"=>346,"HHI"=>-341,"HII"=>-1088,"HIK"=>731,"HOH"=>-1486,"IHH"=>128,"IHI"=>-3041,"IHO"=>-1935,"IIH"=>-825,"IIM"=>-1035,"IOI"=>-542,"KHH"=>-1216,"KKA"=>491,"KKH"=>-1217,"KOK"=>-1009,"MHH"=>-2694,"MHM"=>-457,"MHO"=>123,"MMH"=>-471,"NNH"=>-1689,"NNO"=>662,"OHO"=>-3393); private $TC4__ = array("HHH"=>-203,"HHI"=>1344,"HHK"=>365,"HHM"=>-122,"HHN"=>182,"HHO"=>669,"HIH"=>804,"HII"=>679,"HOH"=>446,"IHH"=>695,"IHO"=>-2324,"IIH"=>321,"III"=>1497,"IIO"=>656,"IOO"=>54,"KAK"=>4845,"KKA"=>3386,"KKK"=>3065,"MHH"=>-405,"MHI"=>201,"MMH"=>-241,"MMM"=>661,"MOM"=>841); private $TQ1__ = array("BHHH"=>-227,"BHHI"=>316,"BHIH"=>-132,"BIHH"=>60,"BIII"=>1595,"BNHH"=>-744,"BOHH"=>225,"BOOO"=>-908,"OAKK"=>482,"OHHH"=>281,"OHIH"=>249,"OIHI"=>200,"OIIH"=>-68); private $TQ2__ = array("BIHH"=>-1401,"BIII"=>-1033,"BKAK"=>-543,"BOOO"=>-5591); private $TQ3__ = array("BHHH"=>478,"BHHM"=>-1073,"BHIH"=>222,"BHII"=>-504,"BIIH"=>-116,"BIII"=>-105,"BMHI"=>-863,"BMHM"=>-464,"BOMH"=>620,"OHHH"=>346,"OHHI"=>1729,"OHII"=>997,"OHMH"=>481,"OIHH"=>623,"OIIH"=>1344,"OKAK"=>2792,"OKHH"=>587,"OKKA"=>679,"OOHH"=>110,"OOII"=>-685); private $TQ4__ = array("BHHH"=>-721,"BHHM"=>-3604,"BHII"=>-966,"BIIH"=>-607,"BIII"=>-2181,"OAAA"=>-2763,"OAKK"=>180,"OHHH"=>-294,"OHHI"=>2446,"OHHO"=>480,"OHIH"=>-1573,"OIHH"=>1935,"OIHI"=>-493,"OIIH"=>626,"OIII"=>-4007,"OKAK"=>-8156); private $TW1__ = array("につい"=>-4681,"東京都"=>2026); private $TW2__ = array("ある程"=>-2049,"いった"=>-1256,"ころが"=>-2434,"しょう"=>3873,"その後"=>-4430,"だって"=>-1049,"ていた"=>1833,"として"=>-4657,"ともに"=>-4517,"もので"=>1882,"一気に"=>-792,"初めて"=>-1512,"同時に"=>-8097,"大きな"=>-1255,"対して"=>-2721,"社会党"=>-3216); private $TW3__ = array("いただ"=>-1734,"してい"=>1314,"として"=>-4314,"につい"=>-5483,"にとっ"=>-5989,"に当た"=>-6247,"ので,"=>-727,"ので、"=>-727,"のもの"=>-600,"れから"=>-3752,"十二月"=>-2287); private $TW4__ = array("いう."=>8576,"いう。"=>8576,"からな"=>-2348,"してい"=>2958,"たが,"=>1516,"たが、"=>1516,"ている"=>1538,"という"=>1349,"ました"=>5543,"ません"=>1097,"ようと"=>-4258,"よると"=>5865); private $UC1__ = array("A"=>484,"K"=>93,"M"=>645,"O"=>-505); private $UC2__ = array("A"=>819,"H"=>1059,"I"=>409,"M"=>3987,"N"=>5775,"O"=>646); private $UC3__ = array("A"=>-1370,"I"=>2311); private $UC4__ = array("A"=>-2643,"H"=>1809,"I"=>-1032,"K"=>-3450,"M"=>3565,"N"=>3876,"O"=>6646); private $UC5__ = array("H"=>313,"I"=>-1238,"K"=>-799,"M"=>539,"O"=>-831); private $UC6__ = array("H"=>-506,"I"=>-253,"K"=>87,"M"=>247,"O"=>-387); private $UP1__ = array("O"=>-214); private $UP2__ = array("B"=>69,"O"=>935); private $UP3__ = array("B"=>189); private $UQ1__ = array("BH"=>21,"BI"=>-12,"BK"=>-99,"BN"=>142,"BO"=>-56,"OH"=>-95,"OI"=>477,"OK"=>410,"OO"=>-2422); private $UQ2__ = array("BH"=>216,"BI"=>113,"OK"=>1759); private $UQ3__ = array("BA"=>-479,"BH"=>42,"BI"=>1913,"BK"=>-7198,"BM"=>3160,"BN"=>6427,"BO"=>14761,"OI"=>-827,"ON"=>-3212); private $UW1__ = array(","=>156,"、"=>156,"「"=>-463,"あ"=>-941,"う"=>-127,"が"=>-553,"き"=>121,"こ"=>505,"で"=>-201,"と"=>-547,"ど"=>-123,"に"=>-789,"の"=>-185,"は"=>-847,"も"=>-466,"や"=>-470,"よ"=>182,"ら"=>-292,"り"=>208,"れ"=>169,"を"=>-446,"ん"=>-137,"・"=>-135,"主"=>-402,"京"=>-268,"区"=>-912,"午"=>871,"国"=>-460,"大"=>561,"委"=>729,"市"=>-411,"日"=>-141,"理"=>361,"生"=>-408,"県"=>-386,"都"=>-718,"「"=>-463,"・"=>-135); private $UW2__ = array(","=>-829,"、"=>-829,"〇"=>892,"「"=>-645,"」"=>3145,"あ"=>-538,"い"=>505,"う"=>134,"お"=>-502,"か"=>1454,"が"=>-856,"く"=>-412,"こ"=>1141,"さ"=>878,"ざ"=>540,"し"=>1529,"す"=>-675,"せ"=>300,"そ"=>-1011,"た"=>188,"だ"=>1837,"つ"=>-949,"て"=>-291,"で"=>-268,"と"=>-981,"ど"=>1273,"な"=>1063,"に"=>-1764,"の"=>130,"は"=>-409,"ひ"=>-1273,"べ"=>1261,"ま"=>600,"も"=>-1263,"や"=>-402,"よ"=>1639,"り"=>-579,"る"=>-694,"れ"=>571,"を"=>-2516,"ん"=>2095,"ア"=>-587,"カ"=>306,"キ"=>568,"ッ"=>831,"三"=>-758,"不"=>-2150,"世"=>-302,"中"=>-968,"主"=>-861,"事"=>492,"人"=>-123,"会"=>978,"保"=>362,"入"=>548,"初"=>-3025,"副"=>-1566,"北"=>-3414,"区"=>-422,"大"=>-1769,"天"=>-865,"太"=>-483,"子"=>-1519,"学"=>760,"実"=>1023,"小"=>-2009,"市"=>-813,"年"=>-1060,"強"=>1067,"手"=>-1519,"揺"=>-1033,"政"=>1522,"文"=>-1355,"新"=>-1682,"日"=>-1815,"明"=>-1462,"最"=>-630,"朝"=>-1843,"本"=>-1650,"東"=>-931,"果"=>-665,"次"=>-2378,"民"=>-180,"気"=>-1740,"理"=>752,"発"=>529,"目"=>-1584,"相"=>-242,"県"=>-1165,"立"=>-763,"第"=>810,"米"=>509,"自"=>-1353,"行"=>838,"西"=>-744,"見"=>-3874,"調"=>1010,"議"=>1198,"込"=>3041,"開"=>1758,"間"=>-1257,"「"=>-645,"」"=>3145,"ッ"=>831,"ア"=>-587,"カ"=>306,"キ"=>568); private $UW3__ = array(","=>4889,"1"=>-800,"?"=>-1723,"、"=>4889,"々"=>-2311,"〇"=>5827,"」"=>2670,"〓"=>-3573,"あ"=>-2696,"い"=>1006,"う"=>2342,"え"=>1983,"お"=>-4864,"か"=>-1163,"が"=>3271,"く"=>1004,"け"=>388,"げ"=>401,"こ"=>-3552,"ご"=>-3116,"さ"=>-1058,"し"=>-395,"す"=>584,"せ"=>3685,"そ"=>-5228,"た"=>842,"ち"=>-521,"っ"=>-1444,"つ"=>-1081,"て"=>6167,"で"=>2318,"と"=>1691,"ど"=>-899,"な"=>-2788,"に"=>2745,"の"=>4056,"は"=>4555,"ひ"=>-2171,"ふ"=>-1798,"へ"=>1199,"ほ"=>-5516,"ま"=>-4384,"み"=>-120,"め"=>1205,"も"=>2323,"や"=>-788,"よ"=>-202,"ら"=>727,"り"=>649,"る"=>5905,"れ"=>2773,"わ"=>-1207,"を"=>6620,"ん"=>-518,"ア"=>551,"グ"=>1319,"ス"=>874,"ッ"=>-1350,"ト"=>521,"ム"=>1109,"ル"=>1591,"ロ"=>2201,"ン"=>278,"・"=>-3794,"一"=>-1619,"下"=>-1759,"世"=>-2087,"両"=>3815,"中"=>653,"主"=>-758,"予"=>-1193,"二"=>974,"人"=>2742,"今"=>792,"他"=>1889,"以"=>-1368,"低"=>811,"何"=>4265,"作"=>-361,"保"=>-2439,"元"=>4858,"党"=>3593,"全"=>1574,"公"=>-3030,"六"=>755,"共"=>-1880,"円"=>5807,"再"=>3095,"分"=>457,"初"=>2475,"別"=>1129,"前"=>2286,"副"=>4437,"力"=>365,"動"=>-949,"務"=>-1872,"化"=>1327,"北"=>-1038,"区"=>4646,"千"=>-2309,"午"=>-783,"協"=>-1006,"口"=>483,"右"=>1233,"各"=>3588,"合"=>-241,"同"=>3906,"和"=>-837,"員"=>4513,"国"=>642,"型"=>1389,"場"=>1219,"外"=>-241,"妻"=>2016,"学"=>-1356,"安"=>-423,"実"=>-1008,"家"=>1078,"小"=>-513,"少"=>-3102,"州"=>1155,"市"=>3197,"平"=>-1804,"年"=>2416,"広"=>-1030,"府"=>1605,"度"=>1452,"建"=>-2352,"当"=>-3885,"得"=>1905,"思"=>-1291,"性"=>1822,"戸"=>-488,"指"=>-3973,"政"=>-2013,"教"=>-1479,"数"=>3222,"文"=>-1489,"新"=>1764,"日"=>2099,"旧"=>5792,"昨"=>-661,"時"=>-1248,"曜"=>-951,"最"=>-937,"月"=>4125,"期"=>360,"李"=>3094,"村"=>364,"東"=>-805,"核"=>5156,"森"=>2438,"業"=>484,"氏"=>2613,"民"=>-1694,"決"=>-1073,"法"=>1868,"海"=>-495,"無"=>979,"物"=>461,"特"=>-3850,"生"=>-273,"用"=>914,"町"=>1215,"的"=>7313,"直"=>-1835,"省"=>792,"県"=>6293,"知"=>-1528,"私"=>4231,"税"=>401,"立"=>-960,"第"=>1201,"米"=>7767,"系"=>3066,"約"=>3663,"級"=>1384,"統"=>-4229,"総"=>1163,"線"=>1255,"者"=>6457,"能"=>725,"自"=>-2869,"英"=>785,"見"=>1044,"調"=>-562,"財"=>-733,"費"=>1777,"車"=>1835,"軍"=>1375,"込"=>-1504,"通"=>-1136,"選"=>-681,"郎"=>1026,"郡"=>4404,"部"=>1200,"金"=>2163,"長"=>421,"開"=>-1432,"間"=>1302,"関"=>-1282,"雨"=>2009,"電"=>-1045,"非"=>2066,"駅"=>1620,"1"=>-800,"」"=>2670,"・"=>-3794,"ッ"=>-1350,"ア"=>551,"グ"=>1319,"ス"=>874,"ト"=>521,"ム"=>1109,"ル"=>1591,"ロ"=>2201,"ン"=>278); private $UW4__ = array(","=>3930,"."=>3508,"―"=>-4841,"、"=>3930,"。"=>3508,"〇"=>4999,"「"=>1895,"」"=>3798,"〓"=>-5156,"あ"=>4752,"い"=>-3435,"う"=>-640,"え"=>-2514,"お"=>2405,"か"=>530,"が"=>6006,"き"=>-4482,"ぎ"=>-3821,"く"=>-3788,"け"=>-4376,"げ"=>-4734,"こ"=>2255,"ご"=>1979,"さ"=>2864,"し"=>-843,"じ"=>-2506,"す"=>-731,"ず"=>1251,"せ"=>181,"そ"=>4091,"た"=>5034,"だ"=>5408,"ち"=>-3654,"っ"=>-5882,"つ"=>-1659,"て"=>3994,"で"=>7410,"と"=>4547,"な"=>5433,"に"=>6499,"ぬ"=>1853,"ね"=>1413,"の"=>7396,"は"=>8578,"ば"=>1940,"ひ"=>4249,"び"=>-4134,"ふ"=>1345,"へ"=>6665,"べ"=>-744,"ほ"=>1464,"ま"=>1051,"み"=>-2082,"む"=>-882,"め"=>-5046,"も"=>4169,"ゃ"=>-2666,"や"=>2795,"ょ"=>-1544,"よ"=>3351,"ら"=>-2922,"り"=>-9726,"る"=>-14896,"れ"=>-2613,"ろ"=>-4570,"わ"=>-1783,"を"=>13150,"ん"=>-2352,"カ"=>2145,"コ"=>1789,"セ"=>1287,"ッ"=>-724,"ト"=>-403,"メ"=>-1635,"ラ"=>-881,"リ"=>-541,"ル"=>-856,"ン"=>-3637,"・"=>-4371,"ー"=>-11870,"一"=>-2069,"中"=>2210,"予"=>782,"事"=>-190,"井"=>-1768,"人"=>1036,"以"=>544,"会"=>950,"体"=>-1286,"作"=>530,"側"=>4292,"先"=>601,"党"=>-2006,"共"=>-1212,"内"=>584,"円"=>788,"初"=>1347,"前"=>1623,"副"=>3879,"力"=>-302,"動"=>-740,"務"=>-2715,"化"=>776,"区"=>4517,"協"=>1013,"参"=>1555,"合"=>-1834,"和"=>-681,"員"=>-910,"器"=>-851,"回"=>1500,"国"=>-619,"園"=>-1200,"地"=>866,"場"=>-1410,"塁"=>-2094,"士"=>-1413,"多"=>1067,"大"=>571,"子"=>-4802,"学"=>-1397,"定"=>-1057,"寺"=>-809,"小"=>1910,"屋"=>-1328,"山"=>-1500,"島"=>-2056,"川"=>-2667,"市"=>2771,"年"=>374,"庁"=>-4556,"後"=>456,"性"=>553,"感"=>916,"所"=>-1566,"支"=>856,"改"=>787,"政"=>2182,"教"=>704,"文"=>522,"方"=>-856,"日"=>1798,"時"=>1829,"最"=>845,"月"=>-9066,"木"=>-485,"来"=>-442,"校"=>-360,"業"=>-1043,"氏"=>5388,"民"=>-2716,"気"=>-910,"沢"=>-939,"済"=>-543,"物"=>-735,"率"=>672,"球"=>-1267,"生"=>-1286,"産"=>-1101,"田"=>-2900,"町"=>1826,"的"=>2586,"目"=>922,"省"=>-3485,"県"=>2997,"空"=>-867,"立"=>-2112,"第"=>788,"米"=>2937,"系"=>786,"約"=>2171,"経"=>1146,"統"=>-1169,"総"=>940,"線"=>-994,"署"=>749,"者"=>2145,"能"=>-730,"般"=>-852,"行"=>-792,"規"=>792,"警"=>-1184,"議"=>-244,"谷"=>-1000,"賞"=>730,"車"=>-1481,"軍"=>1158,"輪"=>-1433,"込"=>-3370,"近"=>929,"道"=>-1291,"選"=>2596,"郎"=>-4866,"都"=>1192,"野"=>-1100,"銀"=>-2213,"長"=>357,"間"=>-2344,"院"=>-2297,"際"=>-2604,"電"=>-878,"領"=>-1659,"題"=>-792,"館"=>-1984,"首"=>1749,"高"=>2120,"「"=>1895,"」"=>3798,"・"=>-4371,"ッ"=>-724,"ー"=>-11870,"カ"=>2145,"コ"=>1789,"セ"=>1287,"ト"=>-403,"メ"=>-1635,"ラ"=>-881,"リ"=>-541,"ル"=>-856,"ン"=>-3637); private $UW5__ = array(","=>465,"."=>-299,"1"=>-514,"E2"=>-32768,"]"=>-2762,"、"=>465,"。"=>-299,"「"=>363,"あ"=>1655,"い"=>331,"う"=>-503,"え"=>1199,"お"=>527,"か"=>647,"が"=>-421,"き"=>1624,"ぎ"=>1971,"く"=>312,"げ"=>-983,"さ"=>-1537,"し"=>-1371,"す"=>-852,"だ"=>-1186,"ち"=>1093,"っ"=>52,"つ"=>921,"て"=>-18,"で"=>-850,"と"=>-127,"ど"=>1682,"な"=>-787,"に"=>-1224,"の"=>-635,"は"=>-578,"べ"=>1001,"み"=>502,"め"=>865,"ゃ"=>3350,"ょ"=>854,"り"=>-208,"る"=>429,"れ"=>504,"わ"=>419,"を"=>-1264,"ん"=>327,"イ"=>241,"ル"=>451,"ン"=>-343,"中"=>-871,"京"=>722,"会"=>-1153,"党"=>-654,"務"=>3519,"区"=>-901,"告"=>848,"員"=>2104,"大"=>-1296,"学"=>-548,"定"=>1785,"嵐"=>-1304,"市"=>-2991,"席"=>921,"年"=>1763,"思"=>872,"所"=>-814,"挙"=>1618,"新"=>-1682,"日"=>218,"月"=>-4353,"査"=>932,"格"=>1356,"機"=>-1508,"氏"=>-1347,"田"=>240,"町"=>-3912,"的"=>-3149,"相"=>1319,"省"=>-1052,"県"=>-4003,"研"=>-997,"社"=>-278,"空"=>-813,"統"=>1955,"者"=>-2233,"表"=>663,"語"=>-1073,"議"=>1219,"選"=>-1018,"郎"=>-368,"長"=>786,"間"=>1191,"題"=>2368,"館"=>-689,"1"=>-514,"E2"=>-32768,"「"=>363,"イ"=>241,"ル"=>451,"ン"=>-343); private $UW6__ = array(","=>227,"."=>808,"1"=>-270,"E1"=>306,"、"=>227,"。"=>808,"あ"=>-307,"う"=>189,"か"=>241,"が"=>-73,"く"=>-121,"こ"=>-200,"じ"=>1782,"す"=>383,"た"=>-428,"っ"=>573,"て"=>-1014,"で"=>101,"と"=>-105,"な"=>-253,"に"=>-149,"の"=>-417,"は"=>-236,"も"=>-206,"り"=>187,"る"=>-135,"を"=>195,"ル"=>-673,"ン"=>-496,"一"=>-277,"中"=>201,"件"=>-800,"会"=>624,"前"=>302,"区"=>1792,"員"=>-1212,"委"=>798,"学"=>-960,"市"=>887,"広"=>-695,"後"=>535,"業"=>-697,"相"=>753,"社"=>-507,"福"=>974,"空"=>-822,"者"=>1811,"連"=>463,"郎"=>1082,"1"=>-270,"E1"=>306,"ル"=>-673,"ン"=>-496); private function ctype_($str){ foreach($this->patterns_ as $pattern => $type){ if(preg_match('/'.$pattern.'/u', $str)){ return $type; } } return "O"; } private function ts_(&$v){ if(isset($v) && $v){return $v;} return 0; } public function segment($input){ if(!$input){ return array(); } $result = array(); $seg = array("B3","B2","B1"); $ctype = array("O","O","O"); $o = preg_split("//u", $input, -1, PREG_SPLIT_NO_EMPTY); for($i = 0; $i<count($o); ++$i) { $seg[] = $o[$i]; $ctype[] = $this->ctype_($o[$i]); } $seg[] = "E1"; $seg[] = "E2"; $seg[] = "E3"; $ctype[] = "O"; $ctype[] = "O"; $ctype[] = "O"; $word = $seg[3]; $p1 = "U"; $p2 = "U"; $p3 = "U"; for($i = 4; $i<count($seg)-3; ++$i){ $score = $this->BIAS__; $w1 = $seg[$i-3]; $w2 = $seg[$i-2]; $w3 = $seg[$i-1]; $w4 = $seg[$i]; $w5 = $seg[$i+1]; $w6 = $seg[$i+2]; $c1 = $ctype[$i-3]; $c2 = $ctype[$i-2]; $c3 = $ctype[$i-1]; $c4 = $ctype[$i]; $c5 = $ctype[$i+1]; $c6 = $ctype[$i+2]; $score += $this->ts_($this->UP1__[$p1]); $score += $this->ts_($this->UP2__[$p2]); $score += $this->ts_($this->UP3__[$p3]); $score += $this->ts_($this->BP1__[$p1 . $p2]); $score += $this->ts_($this->BP2__[$p2 . $p3]); $score += $this->ts_($this->UW1__[$w1]); $score += $this->ts_($this->UW2__[$w2]); $score += $this->ts_($this->UW3__[$w3]); $score += $this->ts_($this->UW4__[$w4]); $score += $this->ts_($this->UW5__[$w5]); $score += $this->ts_($this->UW6__[$w6]); $score += $this->ts_($this->BW1__[$w2 . $w3]); $score += $this->ts_($this->BW2__[$w3 . $w4]); $score += $this->ts_($this->BW3__[$w4 . $w5]); $score += $this->ts_($this->TW1__[$w1 . $w2 . $w3]); $score += $this->ts_($this->TW2__[$w2 . $w3 . $w4]); $score += $this->ts_($this->TW3__[$w3 . $w4 . $w5]); $score += $this->ts_($this->TW4__[$w4 . $w5 . $w6]); $score += $this->ts_($this->UC1__[$c1]); $score += $this->ts_($this->UC2__[$c2]); $score += $this->ts_($this->UC3__[$c3]); $score += $this->ts_($this->UC4__[$c4]); $score += $this->ts_($this->UC5__[$c5]); $score += $this->ts_($this->UC6__[$c6]); $score += $this->ts_($this->BC1__[$c2 . $c3]); $score += $this->ts_($this->BC2__[$c3 . $c4]); $score += $this->ts_($this->BC3__[$c4 . $c5]); $score += $this->ts_($this->TC1__[$c1 . $c2 . $c3]); $score += $this->ts_($this->TC2__[$c2 . $c3 . $c4]); $score += $this->ts_($this->TC3__[$c3 . $c4 . $c5]); $score += $this->ts_($this->TC4__[$c4 . $c5 . $c6]); // $score += $this->ts_($this->TC5__[$c4 . $c5 . $c6]); $score += $this->ts_($this->UQ1__[$p1 . $c1]); $score += $this->ts_($this->UQ2__[$p2 . $c2]); $score += $this->ts_($this->UQ1__[$p3 . $c3]); $score += $this->ts_($this->BQ1__[$p2 . $c2 . $c3]); $score += $this->ts_($this->BQ2__[$p2 . $c3 . $c4]); $score += $this->ts_($this->BQ3__[$p3 . $c2 . $c3]); $score += $this->ts_($this->BQ4__[$p3 . $c3 . $c4]); $score += $this->ts_($this->TQ1__[$p2 . $c1 . $c2 . $c3]); $score += $this->ts_($this->TQ2__[$p2 . $c2 . $c3 . $c4]); $score += $this->ts_($this->TQ3__[$p3 . $c1 . $c2 . $c3]); $score += $this->ts_($this->TQ4__[$p3 . $c2 . $c3 . $c4]); $p = "O"; if($score > 0){ $result[] = $word; $word = ""; $p = "B"; } $p1 = $p2; $p2 = $p3; $p3 = $p; $word .= $seg[$i]; } $result[] = $word; return $result; } }
テストしてみる。
$ts = new TinySegmenterarray();
$result = $ts->segment("科学の力ではどうしようもできない、魑魅魍魎などの奇怪な輩に立ち向かう胡散臭い男。");
print_r($result);
結果
Array ( [0] => 科学 [1] => の [2] => 力 [3] => で [4] => は [5] => どう [6] => しよ [7] => う [8] => も [9] => でき [10] => ない [11] => 、 [12] => 魑魅 [13] => 魍魎 [14] => など [15] => の [16] => 奇怪 [17] => な [18] => 輩 [19] => に [20] => 立ち向かう [21] => 胡散 [22] => 臭い [23] => 男 [24] => 。 )
ベンチマーク
オリジナル
Requests per second: 20.63 [#/sec] (mean)
改造版
Requests per second: 22.36 [#/sec] (mean)
秒間2回リクエスト分ぐらい早くなりましたw
TinySegmenterは修正BSDライセンスで、TinySegmenterのphp版のライセンスは独自ライセンス?。自由に改変してもいいのかな。。。私の変更分はNYSLでいいんですが、、オリジナルのライセンスに従ってください。掲示板で作者の方に修正BSDライセンスと伺ったので、修正BSDライセンスでご利用ください。
#補足 オリジナルのライセンスにしたがってというのは、私の修正からみたオリジナルです。だから、php版のTinySegmenterのライセンスに従ってくださいという意味で使いました。言葉足らずだったかもしれないので、補足します。
最後に、TinySegmenterの作者とphp版の作者に感謝します。素晴らしいソフトウェアをありがとう!