当時書いたコードをその後改良したもの
mc.h
- ツリー状のデータで文字のつながりを表現。
a+h+a | -o +s-a
前一文字がaのとき次にhやsが出た回数とか、前二文字がahのときに次にaやoがでた回数を記録。
class MCDat { public: int val; //文字の値 int hist; // 出現回数 class MCDat *childRoot;//次世代代表へのポインタ class MCDat *nextLT; // 同世代兄弟へのポインタ。二分木で管理 class MCDat *nextGT; // 〃 int part_sum; //二分木で自分より左にある同世代のhist 合計 void set_part_sum(); MCDat(int c0); //初期化 class MCDat *find_child(int c0); //文字c0の子供を探す。なければNULL class MCDat *add_child(int c0); //文字c0の子供をツリーに追加。頻度を+1 class MCDat *random_child(); //頻度に応じて子供をランダム選択 };
mc.cc
#include <stdlib.h> #include "mc.h" MCDat::MCDat(int c0) { val = c0; hist = 0; childRoot = NULL; nextLT = NULL; nextGT = NULL; part_sum = -1; } MCDat *MCDat::find_child(int c0) { MCDat *p = childRoot; while(p!=NULL) { if (p->val == c0) break; else if(p->val < c0) p = p->nextLT; else p = p->nextGT; } return p; } MCDat *MCDat::add_child(int c0) { MCDat *p = childRoot; MCDat **prev = &childRoot; while(p!=NULL) { if (p->val == c0) break; else if(p->val < c0) prev = &(p->nextLT); else prev = &(p->nextGT); p = *prev; } if (!p) { p=new MCDat(c0); *prev = p; } p->hist++; return p; } void MCDat::set_part_sum() { MCDat *p; int sum = hist; p = nextLT; if (p) { p->set_part_sum(); sum += p->part_sum; } p = nextGT; if (p) { p->set_part_sum(); sum += p->part_sum; } part_sum = sum; } MCDat *MCDat::random_child() { MCDat *p = childRoot; if (!p) return NULL; if (p->part_sum == -1) p->set_part_sum(); double r0 = rand()*1.0/RAND_MAX; int r = (int)(r0 * p->part_sum); //printf("\nSelect %d/%d\n",r, p->part_sum); while(p != NULL) { MCDat *lt = p->nextLT; int lsum = 0; if (lt) lsum = lt->part_sum; if (r < lsum) { p = lt; } else if(r < lsum + p->hist) break; else { r -= lsum + p->hist; p = p->nextGT; } } return p; }
四文字前まで参考にするコード
#include <stdio.h> #include <stdlib.h> #include <string.h> #include "mc.h" //漢字処理(EUC) int kidx(unsigned char c1, unsigned char c2) { if (c1<0xa1 || c1>0xf3 || c2<0xa0) return -1; return 256+(c1-0xa1)*(128-32)+(c2-0xa0); } int kidxs(const char *c) { return kidx((unsigned char)c[0],(unsigned char)c[1]); } void prtkidx(int idx) { if (idx<256) { putchar(idx); return; } idx -= 256; //putchar('['); putchar(0xa1+idx/(128-32)); putchar(0xa0+idx%(128-32)); //putchar(']'); } int getwchar(unsigned char **c) { unsigned char c1 = **c; unsigned char c2 = *((*c)+1); if(c1<32 || c2<32) return -1; if (c1<128) { (*c)++; return c1; } else { *c+=2; return kidx(c1,c2); } } int main(int argc, char **argv) { MCDat *root = new MCDat(0); unsigned char buf[10010]; int kmax = kidx(0xf3,0xff); int i,j,k; FILE *fp = stdin; //文頭出現頻度 int *tophist =(int *)malloc(sizeof(int)*kmax); int tophist_sum = 0; for (i=0;i<kmax;i++) tophist[i]=0; MCDat *p1, *p2, *p3, *p4; //元データ読み込み while(1) { unsigned char *c = buf; fgets((char *)buf,10000, fp); if(feof(fp))break; if (strncmp((char *)buf,"XXXX",4)==0)break; p1=p2=p3=p4 = NULL; while(1) { int c0 = getwchar(&c); if (c0 == -1) break; if (!p1) { tophist[c0]++; tophist_sum++; } #if 0 //二文字以前無視の場合 if(p4) p4->add_child(c0); if(p3) p4 = p3->add_child(c0); if(p2) p3 = p2->add_child(c0); #endif if(p2) p2->add_child(c0); if(p1) p2 = p1->add_child(c0); p1 = root->add_child(c0); //文末処理 if (c0 == kidxs("。") || c0 == kidxs(".") || c0 == kidxs("!") || c0 == kidxs("?") ) { p1=p2=p3=p4=NULL; } #if 0 else if (c0 == kidxs("『") || c0 == kidxs("「") || c0 == kidxs("」") || c0 == kidxs("』") ) { p2=p3=p4=NULL; } #endif }//buf } //ランダム生成 for(i=0;i<1000;i++) { //文頭一文字決定 double r=1.0*rand()/RAND_MAX; int ri = (int)(tophist_sum*r); int c0 = 0; int len=0; while(c0<kmax) { ri -= tophist[c0]; if (ri<=0)break; c0++; } prtkidx(c0); p1 = root->find_child(c0); p2 = p3 = p4 = NULL; //二文字目以降 while(len<200) { MCDat *p; len++; #if 0 if (p4) p = p4->random_child(); else if (p3) p = p3->random_child(); else #endif if (p2) p = p2->random_child(); else if (p1) p = p1->random_child(); else break; if (!p) break; c0 = p->val; prtkidx(c0); if (p3) p4 = p3->find_child(c0); if (p2) p3 = p2->find_child(c0); if (p1) p2 = p1->find_child(c0); p1 = root->find_child(c0); } putchar(10); } }