当時書いたコードをその後改良したもの
mc.h
- ツリー状のデータで文字のつながりを表現。
a+h+a | -o +s-a
前一文字がaのとき次にhやsが出た回数とか、前二文字がahのときに次にaやoがでた回数を記録。
class MCDat
{
public:
int val; //文字の値
int hist; // 出現回数
class MCDat *childRoot;//次世代代表へのポインタ
class MCDat *nextLT; // 同世代兄弟へのポインタ。二分木で管理
class MCDat *nextGT; // 〃
int part_sum; //二分木で自分より左にある同世代のhist 合計
void set_part_sum();
MCDat(int c0); //初期化
class MCDat *find_child(int c0); //文字c0の子供を探す。なければNULL
class MCDat *add_child(int c0); //文字c0の子供をツリーに追加。頻度を+1
class MCDat *random_child(); //頻度に応じて子供をランダム選択
};
mc.cc
#include <stdlib.h> #include "mc.h" MCDat::MCDat(int c0) { val = c0; hist = 0; childRoot = NULL; nextLT = NULL; nextGT = NULL; part_sum = -1; } MCDat *MCDat::find_child(int c0) { MCDat *p = childRoot; while(p!=NULL) { if (p->val == c0) break; else if(p->val < c0) p = p->nextLT; else p = p->nextGT; } return p; } MCDat *MCDat::add_child(int c0) { MCDat *p = childRoot; MCDat **prev = &childRoot; while(p!=NULL) { if (p->val == c0) break; else if(p->val < c0) prev = &(p->nextLT); else prev = &(p->nextGT); p = *prev; } if (!p) { p=new MCDat(c0); *prev = p; } p->hist++; return p; } void MCDat::set_part_sum() { MCDat *p; int sum = hist; p = nextLT; if (p) { p->set_part_sum(); sum += p->part_sum; } p = nextGT; if (p) { p->set_part_sum(); sum += p->part_sum; } part_sum = sum; } MCDat *MCDat::random_child() { MCDat *p = childRoot; if (!p) return NULL; if (p->part_sum == -1) p->set_part_sum(); double r0 = rand()*1.0/RAND_MAX; int r = (int)(r0 * p->part_sum); //printf("\nSelect %d/%d\n",r, p->part_sum); while(p != NULL) { MCDat *lt = p->nextLT; int lsum = 0; if (lt) lsum = lt->part_sum; if (r < lsum) { p = lt; } else if(r < lsum + p->hist) break; else { r -= lsum + p->hist; p = p->nextGT; } } return p; }
四文字前まで参考にするコード
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mc.h"
//漢字処理(EUC)
int kidx(unsigned char c1, unsigned char c2)
{
if (c1<0xa1 || c1>0xf3 || c2<0xa0) return -1;
return 256+(c1-0xa1)*(128-32)+(c2-0xa0);
}
int kidxs(const char *c)
{
return kidx((unsigned char)c[0],(unsigned char)c[1]);
}
void prtkidx(int idx)
{
if (idx<256)
{
putchar(idx);
return;
}
idx -= 256;
//putchar('[');
putchar(0xa1+idx/(128-32));
putchar(0xa0+idx%(128-32));
//putchar(']');
}
int getwchar(unsigned char **c)
{
unsigned char c1 = **c;
unsigned char c2 = *((*c)+1);
if(c1<32 || c2<32) return -1;
if (c1<128)
{
(*c)++;
return c1;
}
else
{
*c+=2;
return kidx(c1,c2);
}
}
int main(int argc, char **argv)
{
MCDat *root = new MCDat(0);
unsigned char buf[10010];
int kmax = kidx(0xf3,0xff);
int i,j,k;
FILE *fp = stdin;
//文頭出現頻度
int *tophist =(int *)malloc(sizeof(int)*kmax);
int tophist_sum = 0;
for (i=0;i<kmax;i++) tophist[i]=0;
MCDat *p1, *p2, *p3, *p4;
//元データ読み込み
while(1)
{
unsigned char *c = buf;
fgets((char *)buf,10000, fp);
if(feof(fp))break;
if (strncmp((char *)buf,"XXXX",4)==0)break;
p1=p2=p3=p4 = NULL;
while(1)
{
int c0 = getwchar(&c);
if (c0 == -1) break;
if (!p1)
{
tophist[c0]++;
tophist_sum++;
}
#if 0 //二文字以前無視の場合
if(p4) p4->add_child(c0);
if(p3) p4 = p3->add_child(c0);
if(p2) p3 = p2->add_child(c0);
#endif
if(p2) p2->add_child(c0);
if(p1) p2 = p1->add_child(c0);
p1 = root->add_child(c0);
//文末処理
if (c0 == kidxs("。") ||
c0 == kidxs(".") ||
c0 == kidxs("!") ||
c0 == kidxs("?") )
{
p1=p2=p3=p4=NULL;
}
#if 0
else
if (c0 == kidxs("『") ||
c0 == kidxs("「") ||
c0 == kidxs("」") ||
c0 == kidxs("』") )
{
p2=p3=p4=NULL;
}
#endif
}//buf
}
//ランダム生成
for(i=0;i<1000;i++)
{
//文頭一文字決定
double r=1.0*rand()/RAND_MAX;
int ri = (int)(tophist_sum*r);
int c0 = 0;
int len=0;
while(c0<kmax)
{
ri -= tophist[c0];
if (ri<=0)break;
c0++;
}
prtkidx(c0);
p1 = root->find_child(c0);
p2 = p3 = p4 = NULL;
//二文字目以降
while(len<200)
{
MCDat *p;
len++;
#if 0
if (p4)
p = p4->random_child();
else
if (p3)
p = p3->random_child();
else
#endif
if (p2)
p = p2->random_child();
else
if (p1)
p = p1->random_child();
else
break;
if (!p) break;
c0 = p->val;
prtkidx(c0);
if (p3) p4 = p3->find_child(c0);
if (p2) p3 = p2->find_child(c0);
if (p1) p2 = p1->find_child(c0);
p1 = root->find_child(c0);
}
putchar(10);
}
}