TinySegmenterのphpを改造してみる。

TinySegmenter っていうjavascriptで実装されている分かち書きソフトがある。

これの php 版があるんだけど、文字の切り出しに mb_substr を使っているのと、@でエラー抑制をしていたので少し書き換えてみた。
ついでに、私はUTF-8しか使わない予定なので、専用に改悪してしまったけど、こんな感じになった。

<?php
/*
	PHP Version of TinySegmenter (http://chasen.org/~taku/software/TinySegmenter/)
	TinySegmenter is super compact Japanese tokenizer.

	TinySegmenter was originally developed by Taku Kudo <taku(at)chasen.org>.
	PHP Version was developed by xnights <programming.magic(at)gmail.com>.
	For details, see http://programming-magic.com/?id=172
*/

class TinySegmenterarray{
	/*
	 * Character-class table consulted by ctype_(). Each key is the body of a
	 * regular expression (it is wrapped in '/.../u' at match time) and each
	 * value is the one-letter class label returned for a matching character.
	 * Order matters: the first matching entry wins, so the kanji numerals (M)
	 * must be listed before the general kanji range (H).
	 *
	 *   M = kanji numerals, H = kanji, I = hiragana, K = katakana,
	 *   A = Latin letters, N = digits.
	 *
	 * The full-width and half-width ranges are written as \x{...} escapes in
	 * single-quoted strings so they cannot be folded into their ASCII /
	 * full-width equivalents by Unicode normalisation of this file:
	 *   U+FF71-U+FF9D half-width katakana, U+FF9E half-width voiced sound mark,
	 *   U+FF70 half-width prolonged sound mark,
	 *   U+FF41-U+FF5A / U+FF21-U+FF3A full-width Latin letters,
	 *   U+FF10-U+FF19 full-width digits.
	 */
	private $patterns_ = array(
		"[一二三四五六七八九十百千万億兆]"=>"M",
		"[一-龠々〆ヵヶ]"=>"H",
		"[ぁ-ん]"=>"I",
		'[ァ-ヴー\x{FF71}-\x{FF9D}\x{FF9E}\x{FF70}]'=>"K",
		'[a-zA-Z\x{FF41}-\x{FF5A}\x{FF21}-\x{FF3A}]'=>"A",
		'[0-9\x{FF10}-\x{FF19}]'=>"N",
	);
	private $BIAS__ = -332;
	private $BC1__ = array("HH"=>6,"II"=>2461,"KH"=>406,"OH"=>-1378);
	private $BC2__ = array("AA"=>-3267,"AI"=>2744,"AN"=>-878,"HH"=>-4070,"HM"=>-1711,"HN"=>4012,"HO"=>3761,"IA"=>1327,"IH"=>-1184,"II"=>-1332,"IK"=>1721,"IO"=>5492,"KI"=>3831,"KK"=>-8741,"MH"=>-3132,"MK"=>3334,"OO"=>-2920);
	private $BC3__ = array("HH"=>996,"HI"=>626,"HK"=>-721,"HN"=>-1307,"HO"=>-836,"IH"=>-301,"KK"=>2762,"MK"=>1079,"MM"=>4034,"OA"=>-1652,"OH"=>266);
	private $BP1__ = array("BB"=>295,"OB"=>304,"OO"=>-125,"UB"=>352);
	private $BP2__ = array("BO"=>60,"OO"=>-1762);
	private $BQ1__ = array("BHH"=>1150,"BHM"=>1521,"BII"=>-1158,"BIM"=>886,"BMH"=>1208,"BNH"=>449,"BOH"=>-91,"BOO"=>-2597,"OHI"=>451,"OIH"=>-296,"OKA"=>1851,"OKH"=>-1020,"OKK"=>904,"OOO"=>2965);
	private $BQ2__ = array("BHH"=>118,"BHI"=>-1159,"BHM"=>466,"BIH"=>-919,"BKK"=>-1720,"BKO"=>864,"OHH"=>-1139,"OHM"=>-181,"OIH"=>153,"UHI"=>-1146);
	private $BQ3__ = array("BHH"=>-792,"BHI"=>2664,"BII"=>-299,"BKI"=>419,"BMH"=>937,"BMM"=>8335,"BNN"=>998,"BOH"=>775,"OHH"=>2174,"OHM"=>439,"OII"=>280,"OKH"=>1798,"OKI"=>-793,"OKO"=>-2242,"OMH"=>-2402,"OOO"=>11699);
	private $BQ4__ = array("BHH"=>-3895,"BIH"=>3761,"BII"=>-4654,"BIK"=>1348,"BKK"=>-1806,"BMI"=>-3385,"BOO"=>-12396,"OAH"=>926,"OHH"=>266,"OHK"=>-2036,"ONN"=>-973);
	private $BW1__ = array(",と"=>660,",同"=>727,"B1あ"=>1404,"B1同"=>542,"、と"=>660,"、同"=>727,"」と"=>1682,"あっ"=>1505,"いう"=>1743,"いっ"=>-2055,"いる"=>672,"うし"=>-4817,"うん"=>665,"から"=>3472,"がら"=>600,"こう"=>-790,"こと"=>2083,"こん"=>-1262,"さら"=>-4143,"さん"=>4573,"した"=>2641,"して"=>1104,"すで"=>-3399,"そこ"=>1977,"それ"=>-871,"たち"=>1122,"ため"=>601,"った"=>3463,"つい"=>-802,"てい"=>805,"てき"=>1249,"でき"=>1127,"です"=>3445,"では"=>844,"とい"=>-4915,"とみ"=>1922,"どこ"=>3887,"ない"=>5713,"なっ"=>3015,"など"=>7379,"なん"=>-1113,"にし"=>2468,"には"=>1498,"にも"=>1671,"に対"=>-912,"の一"=>-501,"の中"=>741,"ませ"=>2448,"まで"=>1711,"まま"=>2600,"まる"=>-2155,"やむ"=>-1947,"よっ"=>-2565,"れた"=>2369,"れで"=>-913,"をし"=>1860,"を見"=>731,"亡く"=>-1886,"京都"=>2558,"取り"=>-2784,"大き"=>-2604,"大阪"=>1497,"平方"=>-2314,"引き"=>-1336,"日本"=>-195,"本当"=>-2423,"毎日"=>-2113,"目指"=>-724,"B1あ"=>1404,"B1同"=>542,"」と"=>1682);
	private $BW2__ = array(".."=>-11822,"11"=>-669,"――"=>-5730,"??"=>-13175,"いう"=>-1609,"うか"=>2490,"かし"=>-1350,"かも"=>-602,"から"=>-7194,"かれ"=>4612,"がい"=>853,"がら"=>-3198,"きた"=>1941,"くな"=>-1597,"こと"=>-8392,"この"=>-4193,"させ"=>4533,"され"=>13168,"さん"=>-3977,"しい"=>-1819,"しか"=>-545,"した"=>5078,"して"=>972,"しな"=>939,"その"=>-3744,"たい"=>-1253,"たた"=>-662,"ただ"=>-3857,"たち"=>-786,"たと"=>1224,"たは"=>-939,"った"=>4589,"って"=>1647,"っと"=>-2094,"てい"=>6144,"てき"=>3640,"てく"=>2551,"ては"=>-3110,"ても"=>-3065,"でい"=>2666,"でき"=>-1528,"でし"=>-3828,"です"=>-4761,"でも"=>-4203,"とい"=>1890,"とこ"=>-1746,"とと"=>-2279,"との"=>720,"とみ"=>5168,"とも"=>-3941,"ない"=>-2488,"なが"=>-1313,"など"=>-6509,"なの"=>2614,"なん"=>3099,"にお"=>-1615,"にし"=>2748,"にな"=>2454,"によ"=>-7236,"に対"=>-14943,"に従"=>-4688,"に関"=>-11388,"のか"=>2093,"ので"=>-7059,"のに"=>-6041,"のの"=>-6125,"はい"=>1073,"はが"=>-1033,"はず"=>-2532,"ばれ"=>1813,"まし"=>-1316,"まで"=>-6621,"まれ"=>5409,"めて"=>-3153,"もい"=>2230,"もの"=>-10713,"らか"=>-944,"らし"=>-1611,"らに"=>-1897,"りし"=>651,"りま"=>1620,"れた"=>4270,"れて"=>849,"れば"=>4114,"ろう"=>6067,"われ"=>7901,"を通"=>-11877,"んだ"=>728,"んな"=>-4115,"一人"=>602,"一方"=>-1375,"一日"=>970,"一部"=>-1051,"上が"=>-4479,"会社"=>-1116,"出て"=>2163,"分の"=>-7758,"同党"=>970,"同日"=>-913,"大阪"=>-2471,"委員"=>-1250,"少な"=>-1050,"年度"=>-8669,"年間"=>-1626,"府県"=>-2363,"手権"=>-1982,"新聞"=>-4066,"日新"=>-722,"日本"=>-7068,"日米"=>3372,"曜日"=>-601,"朝鮮"=>-2355,"本人"=>-2697,"東京"=>-1543,"然と"=>-1384,"社会"=>-1276,"立て"=>-990,"第に"=>-1612,"米国"=>-4268,"11"=>-669);
	private $BW3__ = array("あた"=>-2194,"あり"=>719,"ある"=>3846,"い."=>-1185,"い。"=>-1185,"いい"=>5308,"いえ"=>2079,"いく"=>3029,"いた"=>2056,"いっ"=>1883,"いる"=>5600,"いわ"=>1527,"うち"=>1117,"うと"=>4798,"えと"=>1454,"か."=>2857,"か。"=>2857,"かけ"=>-743,"かっ"=>-4098,"かに"=>-669,"から"=>6520,"かり"=>-2670,"が,"=>1816,"が、"=>1816,"がき"=>-4855,"がけ"=>-1127,"がっ"=>-913,"がら"=>-4977,"がり"=>-2064,"きた"=>1645,"けど"=>1374,"こと"=>7397,"この"=>1542,"ころ"=>-2757,"さい"=>-714,"さを"=>976,"し,"=>1557,"し、"=>1557,"しい"=>-3714,"した"=>3562,"して"=>1449,"しな"=>2608,"しま"=>1200,"す."=>-1310,"す。"=>-1310,"する"=>6521,"ず,"=>3426,"ず、"=>3426,"ずに"=>841,"そう"=>428,"た."=>8875,"た。"=>8875,"たい"=>-594,"たの"=>812,"たり"=>-1183,"たる"=>-853,"だ."=>4098,"だ。"=>4098,"だっ"=>1004,"った"=>-4748,"って"=>300,"てい"=>6240,"てお"=>855,"ても"=>302,"です"=>1437,"でに"=>-1482,"では"=>2295,"とう"=>-1387,"とし"=>2266,"との"=>541,"とも"=>-3543,"どう"=>4664,"ない"=>1796,"なく"=>-903,"など"=>2135,"に,"=>-1021,"に、"=>-1021,"にし"=>1771,"にな"=>1906,"には"=>2644,"の,"=>-724,"の、"=>-724,"の子"=>-1000,"は,"=>1337,"は、"=>1337,"べき"=>2181,"まし"=>1113,"ます"=>6943,"まっ"=>-1549,"まで"=>6154,"まれ"=>-793,"らし"=>1479,"られ"=>6820,"るる"=>3818,"れ,"=>854,"れ、"=>854,"れた"=>1850,"れて"=>1375,"れば"=>-3246,"れる"=>1091,"われ"=>-605,"んだ"=>606,"んで"=>798,"カ月"=>990,"会議"=>860,"入り"=>1232,"大会"=>2217,"始め"=>1681,""=>965,"新聞"=>-5055,"日,"=>974,"日、"=>974,"社会"=>2024,"カ月"=>990);
	private $TC1__ = array("AAA"=>1093,"HHH"=>1029,"HHM"=>580,"HII"=>998,"HOH"=>-390,"HOM"=>-331,"IHI"=>1169,"IOH"=>-142,"IOI"=>-1015,"IOM"=>467,"MMH"=>187,"OOI"=>-1832);
	private $TC2__ = array("HHO"=>2088,"HII"=>-1023,"HMM"=>-1154,"IHI"=>-1965,"KKH"=>703,"OII"=>-2649);
	private $TC3__ = array("AAA"=>-294,"HHH"=>346,"HHI"=>-341,"HII"=>-1088,"HIK"=>731,"HOH"=>-1486,"IHH"=>128,"IHI"=>-3041,"IHO"=>-1935,"IIH"=>-825,"IIM"=>-1035,"IOI"=>-542,"KHH"=>-1216,"KKA"=>491,"KKH"=>-1217,"KOK"=>-1009,"MHH"=>-2694,"MHM"=>-457,"MHO"=>123,"MMH"=>-471,"NNH"=>-1689,"NNO"=>662,"OHO"=>-3393);
	private $TC4__ = array("HHH"=>-203,"HHI"=>1344,"HHK"=>365,"HHM"=>-122,"HHN"=>182,"HHO"=>669,"HIH"=>804,"HII"=>679,"HOH"=>446,"IHH"=>695,"IHO"=>-2324,"IIH"=>321,"III"=>1497,"IIO"=>656,"IOO"=>54,"KAK"=>4845,"KKA"=>3386,"KKK"=>3065,"MHH"=>-405,"MHI"=>201,"MMH"=>-241,"MMM"=>661,"MOM"=>841);
	private $TQ1__ = array("BHHH"=>-227,"BHHI"=>316,"BHIH"=>-132,"BIHH"=>60,"BIII"=>1595,"BNHH"=>-744,"BOHH"=>225,"BOOO"=>-908,"OAKK"=>482,"OHHH"=>281,"OHIH"=>249,"OIHI"=>200,"OIIH"=>-68);
	private $TQ2__ = array("BIHH"=>-1401,"BIII"=>-1033,"BKAK"=>-543,"BOOO"=>-5591);
	private $TQ3__ = array("BHHH"=>478,"BHHM"=>-1073,"BHIH"=>222,"BHII"=>-504,"BIIH"=>-116,"BIII"=>-105,"BMHI"=>-863,"BMHM"=>-464,"BOMH"=>620,"OHHH"=>346,"OHHI"=>1729,"OHII"=>997,"OHMH"=>481,"OIHH"=>623,"OIIH"=>1344,"OKAK"=>2792,"OKHH"=>587,"OKKA"=>679,"OOHH"=>110,"OOII"=>-685);
	private $TQ4__ = array("BHHH"=>-721,"BHHM"=>-3604,"BHII"=>-966,"BIIH"=>-607,"BIII"=>-2181,"OAAA"=>-2763,"OAKK"=>180,"OHHH"=>-294,"OHHI"=>2446,"OHHO"=>480,"OHIH"=>-1573,"OIHH"=>1935,"OIHI"=>-493,"OIIH"=>626,"OIII"=>-4007,"OKAK"=>-8156);
	private $TW1__ = array("につい"=>-4681,"東京都"=>2026);
	private $TW2__ = array("ある程"=>-2049,"いった"=>-1256,"ころが"=>-2434,"しょう"=>3873,"その後"=>-4430,"だって"=>-1049,"ていた"=>1833,"として"=>-4657,"ともに"=>-4517,"もので"=>1882,"一気に"=>-792,"初めて"=>-1512,"同時に"=>-8097,"大きな"=>-1255,"対して"=>-2721,"社会党"=>-3216);
	private $TW3__ = array("いただ"=>-1734,"してい"=>1314,"として"=>-4314,"につい"=>-5483,"にとっ"=>-5989,"に当た"=>-6247,"ので,"=>-727,"ので、"=>-727,"のもの"=>-600,"れから"=>-3752,"十二月"=>-2287);
	private $TW4__ = array("いう."=>8576,"いう。"=>8576,"からな"=>-2348,"してい"=>2958,"たが,"=>1516,"たが、"=>1516,"ている"=>1538,"という"=>1349,"ました"=>5543,"ません"=>1097,"ようと"=>-4258,"よると"=>5865);
	private $UC1__ = array("A"=>484,"K"=>93,"M"=>645,"O"=>-505);
	private $UC2__ = array("A"=>819,"H"=>1059,"I"=>409,"M"=>3987,"N"=>5775,"O"=>646);
	private $UC3__ = array("A"=>-1370,"I"=>2311);
	private $UC4__ = array("A"=>-2643,"H"=>1809,"I"=>-1032,"K"=>-3450,"M"=>3565,"N"=>3876,"O"=>6646);
	private $UC5__ = array("H"=>313,"I"=>-1238,"K"=>-799,"M"=>539,"O"=>-831);
	private $UC6__ = array("H"=>-506,"I"=>-253,"K"=>87,"M"=>247,"O"=>-387);
	private $UP1__ = array("O"=>-214);
	private $UP2__ = array("B"=>69,"O"=>935);
	private $UP3__ = array("B"=>189);
	private $UQ1__ = array("BH"=>21,"BI"=>-12,"BK"=>-99,"BN"=>142,"BO"=>-56,"OH"=>-95,"OI"=>477,"OK"=>410,"OO"=>-2422);
	private $UQ2__ = array("BH"=>216,"BI"=>113,"OK"=>1759);
	private $UQ3__ = array("BA"=>-479,"BH"=>42,"BI"=>1913,"BK"=>-7198,"BM"=>3160,"BN"=>6427,"BO"=>14761,"OI"=>-827,"ON"=>-3212);
	private $UW1__ = array(","=>156,""=>156,""=>-463,""=>-941,""=>-127,""=>-553,""=>121,""=>505,""=>-201,""=>-547,""=>-123,""=>-789,""=>-185,""=>-847,""=>-466,""=>-470,""=>182,""=>-292,""=>208,""=>169,""=>-446,""=>-137,""=>-135,""=>-402,""=>-268,""=>-912,""=>871,""=>-460,""=>561,""=>729,""=>-411,""=>-141,""=>361,""=>-408,""=>-386,""=>-718,""=>-463,""=>-135);
	private $UW2__ = array(","=>-829,""=>-829,""=>892,""=>-645,""=>3145,""=>-538,""=>505,""=>134,""=>-502,""=>1454,""=>-856,""=>-412,""=>1141,""=>878,""=>540,""=>1529,""=>-675,""=>300,""=>-1011,""=>188,""=>1837,""=>-949,""=>-291,""=>-268,""=>-981,""=>1273,""=>1063,""=>-1764,""=>130,""=>-409,""=>-1273,""=>1261,""=>600,""=>-1263,""=>-402,""=>1639,""=>-579,""=>-694,""=>571,""=>-2516,""=>2095,""=>-587,""=>306,""=>568,""=>831,""=>-758,""=>-2150,""=>-302,""=>-968,""=>-861,""=>492,""=>-123,""=>978,""=>362,""=>548,""=>-3025,""=>-1566,""=>-3414,""=>-422,""=>-1769,""=>-865,""=>-483,""=>-1519,""=>760,""=>1023,""=>-2009,""=>-813,""=>-1060,""=>1067,""=>-1519,""=>-1033,""=>1522,""=>-1355,""=>-1682,""=>-1815,""=>-1462,""=>-630,""=>-1843,""=>-1650,""=>-931,""=>-665,""=>-2378,""=>-180,""=>-1740,""=>752,""=>529,""=>-1584,""=>-242,""=>-1165,""=>-763,""=>810,""=>509,""=>-1353,""=>838,"西"=>-744,""=>-3874,"調"=>1010,""=>1198,""=>3041,""=>1758,""=>-1257,""=>-645,""=>3145,""=>831,""=>-587,""=>306,""=>568);
	private $UW3__ = array(","=>4889,"1"=>-800,"?"=>-1723,""=>4889,""=>-2311,""=>5827,""=>2670,""=>-3573,""=>-2696,""=>1006,""=>2342,""=>1983,""=>-4864,""=>-1163,""=>3271,""=>1004,""=>388,""=>401,""=>-3552,""=>-3116,""=>-1058,""=>-395,""=>584,""=>3685,""=>-5228,""=>842,""=>-521,""=>-1444,""=>-1081,""=>6167,""=>2318,""=>1691,""=>-899,""=>-2788,""=>2745,""=>4056,""=>4555,""=>-2171,""=>-1798,""=>1199,""=>-5516,""=>-4384,""=>-120,""=>1205,""=>2323,""=>-788,""=>-202,""=>727,""=>649,""=>5905,""=>2773,""=>-1207,""=>6620,""=>-518,""=>551,""=>1319,""=>874,""=>-1350,""=>521,""=>1109,""=>1591,""=>2201,""=>278,""=>-3794,""=>-1619,""=>-1759,""=>-2087,""=>3815,""=>653,""=>-758,""=>-1193,""=>974,""=>2742,""=>792,""=>1889,""=>-1368,""=>811,""=>4265,""=>-361,""=>-2439,""=>4858,""=>3593,""=>1574,""=>-3030,""=>755,""=>-1880,""=>5807,""=>3095,""=>457,""=>2475,""=>1129,""=>2286,""=>4437,""=>365,""=>-949,""=>-1872,""=>1327,""=>-1038,""=>4646,""=>-2309,""=>-783,""=>-1006,""=>483,""=>1233,""=>3588,""=>-241,""=>3906,""=>-837,""=>4513,""=>642,""=>1389,""=>1219,""=>-241,""=>2016,""=>-1356,""=>-423,""=>-1008,""=>1078,""=>-513,""=>-3102,""=>1155,""=>3197,""=>-1804,""=>2416,""=>-1030,""=>1605,""=>1452,""=>-2352,""=>-3885,""=>1905,""=>-1291,""=>1822,""=>-488,""=>-3973,""=>-2013,""=>-1479,""=>3222,""=>-1489,""=>1764,""=>2099,""=>5792,""=>-661,""=>-1248,""=>-951,""=>-937,""=>4125,""=>360,""=>3094,""=>364,""=>-805,""=>5156,""=>2438,""=>484,""=>2613,""=>-1694,""=>-1073,""=>1868,""=>-495,""=>979,""=>461,""=>-3850,""=>-273,""=>914,""=>1215,""=>7313,""=>-1835,""=>792,""=>6293,""=>-1528,""=>4231,""=>401,""=>-960,""=>1201,""=>7767,""=>3066,""=>3663,""=>1384,""=>-4229,""=>1163,""=>1255,""=>6457,""=>725,""=>-2869,""=>785,""=>1044,"調"=>-562,""=>-733,""=>1777,""=>1835,""=>1375,""=>-1504,""=>-1136,""=>-681,""=>1026,""=>4404,""=>1200,""=>2163,""=>421,""=>-1432,""=>1302,""=>-1282,""=>2009,""=>-1045,""=>2066,""=>1620,""=>-800,""=>2670,""=>-3794,""=>-1350,""=>551,"グ"=>1319,""=>874,""=>521,""=>1109,""=>1591,""=>2201,"
"=>278);
	private $UW4__ = array(","=>3930,"."=>3508,""=>-4841,""=>3930,""=>3508,""=>4999,""=>1895,""=>3798,""=>-5156,""=>4752,""=>-3435,""=>-640,""=>-2514,""=>2405,""=>530,""=>6006,""=>-4482,""=>-3821,""=>-3788,""=>-4376,""=>-4734,""=>2255,""=>1979,""=>2864,""=>-843,""=>-2506,""=>-731,""=>1251,""=>181,""=>4091,""=>5034,""=>5408,""=>-3654,""=>-5882,""=>-1659,""=>3994,""=>7410,""=>4547,""=>5433,""=>6499,""=>1853,""=>1413,""=>7396,""=>8578,""=>1940,""=>4249,""=>-4134,""=>1345,""=>6665,""=>-744,""=>1464,""=>1051,""=>-2082,""=>-882,""=>-5046,""=>4169,""=>-2666,""=>2795,""=>-1544,""=>3351,""=>-2922,""=>-9726,""=>-14896,""=>-2613,""=>-4570,""=>-1783,""=>13150,""=>-2352,""=>2145,""=>1789,""=>1287,""=>-724,""=>-403,""=>-1635,""=>-881,""=>-541,""=>-856,""=>-3637,""=>-4371,""=>-11870,""=>-2069,""=>2210,""=>782,""=>-190,""=>-1768,""=>1036,""=>544,""=>950,""=>-1286,""=>530,""=>4292,""=>601,""=>-2006,""=>-1212,""=>584,""=>788,""=>1347,""=>1623,""=>3879,""=>-302,""=>-740,""=>-2715,""=>776,""=>4517,""=>1013,""=>1555,""=>-1834,""=>-681,""=>-910,""=>-851,""=>1500,""=>-619,""=>-1200,""=>866,""=>-1410,""=>-2094,""=>-1413,""=>1067,""=>571,""=>-4802,""=>-1397,""=>-1057,""=>-809,""=>1910,""=>-1328,""=>-1500,""=>-2056,""=>-2667,""=>2771,""=>374,""=>-4556,""=>456,""=>553,""=>916,""=>-1566,""=>856,""=>787,""=>2182,""=>704,""=>522,""=>-856,""=>1798,""=>1829,""=>845,""=>-9066,""=>-485,""=>-442,""=>-360,""=>-1043,""=>5388,""=>-2716,""=>-910,""=>-939,""=>-543,""=>-735,""=>672,""=>-1267,""=>-1286,""=>-1101,""=>-2900,""=>1826,""=>2586,""=>922,""=>-3485,""=>2997,""=>-867,""=>-2112,""=>788,""=>2937,""=>786,""=>2171,""=>1146,""=>-1169,""=>940,""=>-994,""=>749,""=>2145,""=>-730,""=>-852,""=>-792,""=>792,""=>-1184,""=>-244,""=>-1000,""=>730,""=>-1481,""=>1158,""=>-1433,""=>-3370,""=>929,""=>-1291,""=>2596,""=>-4866,""=>1192,""=>-1100,""=>-2213,""=>357,""=>-2344,""=>-2297,""=>-2604,""=>-878,""=>-1659,""=>-792,""=>-1984,""=>1749,""=>2120,""=>1895,""=>3798,""=>-4371,""=>-724,""=>-11870,""=>2145,""=>1789,""=>1287,
""=>-403,""=>-1635,""=>-881,""=>-541,""=>-856,""=>-3637);
	private $UW5__ = array(","=>465,"."=>-299,"1"=>-514,"E2"=>-32768,"]"=>-2762,""=>465,""=>-299,""=>363,""=>1655,""=>331,""=>-503,""=>1199,""=>527,""=>647,""=>-421,""=>1624,""=>1971,""=>312,""=>-983,""=>-1537,""=>-1371,""=>-852,""=>-1186,""=>1093,""=>52,""=>921,""=>-18,""=>-850,""=>-127,""=>1682,""=>-787,""=>-1224,""=>-635,""=>-578,""=>1001,""=>502,""=>865,""=>3350,""=>854,""=>-208,""=>429,""=>504,""=>419,""=>-1264,""=>327,""=>241,""=>451,""=>-343,""=>-871,""=>722,""=>-1153,""=>-654,""=>3519,""=>-901,""=>848,""=>2104,""=>-1296,""=>-548,""=>1785,""=>-1304,""=>-2991,""=>921,""=>1763,""=>872,""=>-814,""=>1618,""=>-1682,""=>218,""=>-4353,""=>932,""=>1356,""=>-1508,""=>-1347,""=>240,""=>-3912,""=>-3149,""=>1319,""=>-1052,""=>-4003,""=>-997,""=>-278,""=>-813,""=>1955,""=>-2233,""=>663,""=>-1073,""=>1219,""=>-1018,""=>-368,""=>786,""=>1191,""=>2368,""=>-689,""=>-514,"E2"=>-32768,""=>363,""=>241,""=>451,""=>-343);
	private $UW6__ = array(","=>227,"."=>808,"1"=>-270,"E1"=>306,""=>227,""=>808,""=>-307,""=>189,""=>241,""=>-73,""=>-121,""=>-200,""=>1782,""=>383,""=>-428,""=>573,""=>-1014,""=>101,""=>-105,""=>-253,""=>-149,""=>-417,""=>-236,""=>-206,""=>187,""=>-135,""=>195,""=>-673,""=>-496,""=>-277,""=>201,""=>-800,""=>624,""=>302,""=>1792,""=>-1212,""=>798,""=>-960,""=>887,""=>-695,""=>535,""=>-697,""=>753,""=>-507,""=>974,""=>-822,""=>1811,""=>463,""=>1082,""=>-270,"E1"=>306,""=>-673,""=>-496);

	/**
	 * Classify a single character.
	 *
	 * Tries each entry of $this->patterns_ in declaration order and returns
	 * the label of the first pattern that matches; "O" (other) is returned
	 * when nothing matches.
	 *
	 * @param string $str the character to classify
	 * @return string one-letter class label
	 */
	private function ctype_($str){
		$label = "O";
		foreach($this->patterns_ as $regex => $candidate){
			if(preg_match('/'.$regex.'/u', $str)){
				$label = $candidate;
				break;
			}
		}
		return $label;
	}

	/**
	 * Return the given value, or 0 when it is unset or falsy.
	 *
	 * The argument is taken by reference so that callers can pass an array
	 * element that may not exist without triggering an "undefined index"
	 * notice. Side effect of that choice: PHP creates the missing element
	 * (with value null) in the caller's array when it binds the reference.
	 *
	 * @param mixed $v value to test (passed by reference, not modified here)
	 * @return mixed $v itself when set and truthy, otherwise 0
	 */
	private function ts_(&$v){
		return !empty($v) ? $v : 0;
	}

	/**
	 * Read-only weight lookup.
	 *
	 * Returns the weight stored under $key in $table, or 0 when the key is
	 * absent. Unlike passing $table[$key] by reference, this never inserts
	 * the missing key into the table, so the weight tables stay constant in
	 * size no matter how much text is segmented.
	 *
	 * @param array  $table one of the weight tables of this class
	 * @param string $key   feature key
	 * @return int weight, or 0 if the feature is unknown
	 */
	private function weight_(array $table, $key){
		return isset($table[$key]) ? $table[$key] : 0;
	}

	/**
	 * Split a UTF-8 string into words.
	 *
	 * The input is broken into characters and padded with three sentinel
	 * entries on each side ("B3","B2","B1" / "E1","E2","E3", all of class
	 * "O"). For every position after the first character a score is built
	 * from BIAS__ plus the weights of the surrounding characters ($w1..$w6),
	 * their classes ($c1..$c6) and the three previous decisions ($p1..$p3:
	 * "U" = not yet decided, "B" = boundary, "O" = no boundary). A positive
	 * score starts a new word at that position.
	 *
	 * @param string $input UTF-8 text; null, false and "" yield an empty array
	 * @return array list of word strings in input order
	 * @throws InvalidArgumentException if $input is not valid UTF-8
	 */
	public function segment($input){
		// Compare explicitly: a loose !$input would also discard the string "0".
		if($input === null || $input === false || $input === ''){
			return array();
		}

		// preg_split() returns false when the subject is not valid UTF-8.
		$chars = preg_split("//u", (string)$input, -1, PREG_SPLIT_NO_EMPTY);
		if($chars === false){
			throw new InvalidArgumentException("TinySegmenter: input is not valid UTF-8");
		}
		if(!$chars){
			return array();
		}

		$seg = array("B3","B2","B1");
		$ctype = array("O","O","O");
		foreach($chars as $ch){
			$seg[] = $ch;
			$ctype[] = $this->ctype_($ch);
		}
		$seg[] = "E1";
		$seg[] = "E2";
		$seg[] = "E3";
		$ctype[] = "O";
		$ctype[] = "O";
		$ctype[] = "O";

		$result = array();
		$word = $seg[3];	// first real character always opens the first word
		$p1 = "U";
		$p2 = "U";
		$p3 = "U";
		$end = count($seg) - 3;	// index of "E1"; loop-invariant
		for($i = 4; $i < $end; ++$i){
			$w1 = $seg[$i-3];
			$w2 = $seg[$i-2];
			$w3 = $seg[$i-1];
			$w4 = $seg[$i];
			$w5 = $seg[$i+1];
			$w6 = $seg[$i+2];
			$c1 = $ctype[$i-3];
			$c2 = $ctype[$i-2];
			$c3 = $ctype[$i-1];
			$c4 = $ctype[$i];
			$c5 = $ctype[$i+1];
			$c6 = $ctype[$i+2];

			$score = $this->BIAS__;
			// previous decisions
			$score += $this->weight_($this->UP1__, $p1);
			$score += $this->weight_($this->UP2__, $p2);
			$score += $this->weight_($this->UP3__, $p3);
			$score += $this->weight_($this->BP1__, $p1 . $p2);
			$score += $this->weight_($this->BP2__, $p2 . $p3);
			// character uni-/bi-/tri-grams
			$score += $this->weight_($this->UW1__, $w1);
			$score += $this->weight_($this->UW2__, $w2);
			$score += $this->weight_($this->UW3__, $w3);
			$score += $this->weight_($this->UW4__, $w4);
			$score += $this->weight_($this->UW5__, $w5);
			$score += $this->weight_($this->UW6__, $w6);
			$score += $this->weight_($this->BW1__, $w2 . $w3);
			$score += $this->weight_($this->BW2__, $w3 . $w4);
			$score += $this->weight_($this->BW3__, $w4 . $w5);
			$score += $this->weight_($this->TW1__, $w1 . $w2 . $w3);
			$score += $this->weight_($this->TW2__, $w2 . $w3 . $w4);
			$score += $this->weight_($this->TW3__, $w3 . $w4 . $w5);
			$score += $this->weight_($this->TW4__, $w4 . $w5 . $w6);
			// character-class uni-/bi-/tri-grams
			$score += $this->weight_($this->UC1__, $c1);
			$score += $this->weight_($this->UC2__, $c2);
			$score += $this->weight_($this->UC3__, $c3);
			$score += $this->weight_($this->UC4__, $c4);
			$score += $this->weight_($this->UC5__, $c5);
			$score += $this->weight_($this->UC6__, $c6);
			$score += $this->weight_($this->BC1__, $c2 . $c3);
			$score += $this->weight_($this->BC2__, $c3 . $c4);
			$score += $this->weight_($this->BC3__, $c4 . $c5);
			$score += $this->weight_($this->TC1__, $c1 . $c2 . $c3);
			$score += $this->weight_($this->TC2__, $c2 . $c3 . $c4);
			$score += $this->weight_($this->TC3__, $c3 . $c4 . $c5);
			$score += $this->weight_($this->TC4__, $c4 . $c5 . $c6);
			// previous decisions combined with character classes
			$score += $this->weight_($this->UQ1__, $p1 . $c1);
			$score += $this->weight_($this->UQ2__, $p2 . $c2);
			// NOTE(review): this reads UQ1__ for p3/c3, and UQ3__ is never read
			// anywhere in this class. Possibly a typo for UQ3__, but switching
			// tables would change segmentation results, so it is kept as is -
			// confirm against the reference implementation before changing.
			$score += $this->weight_($this->UQ1__, $p3 . $c3);
			$score += $this->weight_($this->BQ1__, $p2 . $c2 . $c3);
			$score += $this->weight_($this->BQ2__, $p2 . $c3 . $c4);
			$score += $this->weight_($this->BQ3__, $p3 . $c2 . $c3);
			$score += $this->weight_($this->BQ4__, $p3 . $c3 . $c4);
			$score += $this->weight_($this->TQ1__, $p2 . $c1 . $c2 . $c3);
			$score += $this->weight_($this->TQ2__, $p2 . $c2 . $c3 . $c4);
			$score += $this->weight_($this->TQ3__, $p3 . $c1 . $c2 . $c3);
			$score += $this->weight_($this->TQ4__, $p3 . $c2 . $c3 . $c4);

			$p = "O";
			if($score > 0){
				// boundary before $seg[$i]: flush the word collected so far
				$result[] = $word;
				$word = "";
				$p = "B";
			}
			$p1 = $p2;
			$p2 = $p3;
			$p3 = $p;
			$word .= $seg[$i];
		}
		$result[] = $word;
		return $result;
	}
}

ダウンロードしたい人はこちら

テストしてみる。

// Segment a sample sentence and dump the resulting word list.
$segmenter = new TinySegmenterarray();
print_r($segmenter->segment("科学の力ではどうしようもできない、魑魅魍魎などの奇怪な輩に立ち向かう胡散臭い男。"));

結果

Array ( [0] => 科学 [1] => の [2] => 力 [3] => で [4] => は [5] => どう [6] => しよ [7] => う [8] => も [9] => でき [10] => ない [11] => 、 [12] => 魑魅 [13] => 魍魎 [14] => など [15] => の [16] => 奇怪 [17] => な [18] => 輩 [19] => に [20] => 立ち向かう [21] => 胡散 [22] => 臭い [23] => 男 [24] => 。 ) 

ベンチマーク
オリジナル
Requests per second: 20.63 [#/sec] (mean)
改造版
Requests per second: 22.36 [#/sec] (mean)

1秒あたり2リクエスト分ぐらい速くなりましたw

TinySegmenterは修正BSDライセンスで、TinySegmenterのphp版のライセンスは独自ライセンス?。自由に改変してもいいのかな。。。私の変更分はNYSLでいいんですが、、オリジナルのライセンスに従ってください。掲示板で作者の方に修正BSDライセンスと伺ったので、修正BSDライセンスでご利用ください。

#補足 オリジナルのライセンスにしたがってというのは、私の修正からみたオリジナルです。だから、php版のTinySegmenterのライセンスに従ってくださいという意味で使いました。言葉足らずだったかもしれないので、補足します。

最後に、TinySegmenterの作者とphp版の作者に感謝します。素晴らしいソフトウェアをありがとう!