感想文ランダム生成ソース修正版

当時書いたコードをその後改良したもの

mc.h

ツリー状のデータで文字のつながりを表現。

a+h+a
 | -o
 +s-a

前一文字がaのとき次にhやsが出た回数とか、前二文字がahのときに次にaやoがでた回数を記録。

class MCDat
{
  public:
  int val; //文字の値
  int hist; // 出現回数
  class MCDat *childRoot;//次世代代表へのポインタ
  class MCDat *nextLT;   // 同世代兄弟へのポインタ。二分木で管理
  class MCDat *nextGT;   // 〃

  int part_sum; //二分木で自分より左にある同世代のhist 合計
  void set_part_sum();

  MCDat(int c0); //初期化
  
  class MCDat *find_child(int c0); //文字c0の子供を探す。なければNULL
  class MCDat *add_child(int c0);  //文字c0の子供をツリーに追加。頻度を+1
  class MCDat *random_child();     //頻度に応じて子供をランダム選択

};

mc.cc

#include <stdlib.h>
#include "mc.h"

MCDat::MCDat(int c0)
{
    val = c0;
    hist = 0;
    childRoot = NULL;
    nextLT = NULL;
    nextGT = NULL;

    part_sum = -1;
}

MCDat *MCDat::find_child(int c0)
{
    MCDat *p = childRoot;

    while(p!=NULL)
    {
	if (p->val == c0)
	    break;
	else if(p->val < c0)
	    p = p->nextLT;
	else
	    p = p->nextGT;
    }
    return p;
}

MCDat *MCDat::add_child(int c0)
{
    MCDat *p = childRoot;
    MCDat **prev = &childRoot;

    while(p!=NULL)
    {
	if (p->val == c0)
	    break;
	else if(p->val < c0)
	    prev = &(p->nextLT);
	else
	    prev = &(p->nextGT);

	p = *prev;
    }
    if (!p)
    {
	p=new MCDat(c0);
	*prev = p;
    }
    p->hist++;
    return p;
}

void MCDat::set_part_sum()
{
    MCDat *p;
    int sum = hist;

    p = nextLT;
    if (p)
    {
	p->set_part_sum();
	sum += p->part_sum;
    }

    p = nextGT;
    if (p)
    {
	p->set_part_sum();
	sum += p->part_sum;
    }

    part_sum = sum;
}

MCDat *MCDat::random_child()
{
    MCDat *p = childRoot;

    if (!p) return NULL;

    if (p->part_sum == -1)
	p->set_part_sum();

    double r0 = rand()*1.0/RAND_MAX;
    int r = (int)(r0 * p->part_sum);

    //printf("\nSelect %d/%d\n",r, p->part_sum);

    while(p != NULL)
    {
        MCDat *lt = p->nextLT;
        int lsum = 0;
	if (lt) lsum = lt->part_sum;
	if (r < lsum)
	{
	    p = lt;
	}
	else if(r < lsum + p->hist) break;
	else
	{
	    r -= lsum + p->hist;
            p = p->nextGT;
	}
    }
    return p;
}

四文字前まで参考にするコード

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mc.h"

//漢字処理(EUC)
int kidx(unsigned char c1, unsigned char c2)
{
    if (c1<0xa1 || c1>0xf3 || c2<0xa0) return -1;

    return 256+(c1-0xa1)*(128-32)+(c2-0xa0);
}

int kidxs(const char *c)
{
    return kidx((unsigned char)c[0],(unsigned char)c[1]);
}

void prtkidx(int idx)
{
    if (idx<256)
    {
	putchar(idx);
	return;
    }
    idx -= 256;
    //putchar('[');
    putchar(0xa1+idx/(128-32));
    putchar(0xa0+idx%(128-32));
    //putchar(']');
}

int getwchar(unsigned char **c)
{
    unsigned char c1 = **c;
    unsigned char c2 = *((*c)+1);
    if(c1<32 || c2<32) return -1;
    if (c1<128)
    {
	(*c)++;
	return c1;
    }
    else
    {
	*c+=2;
	return  kidx(c1,c2);
    }
}

int  main(int argc, char **argv)
{
    MCDat *root = new MCDat(0);
    unsigned char buf[10010];

    int kmax = kidx(0xf3,0xff);
    int i,j,k;
    FILE *fp = stdin;
    //文頭出現頻度
    int *tophist =(int *)malloc(sizeof(int)*kmax);
    int tophist_sum = 0;
    for (i=0;i<kmax;i++) tophist[i]=0;

    MCDat *p1, *p2, *p3, *p4;

    //元データ読み込み
    while(1)
    {
	unsigned char *c = buf;
	fgets((char *)buf,10000, fp);
	if(feof(fp))break;
	if (strncmp((char *)buf,"XXXX",4)==0)break;
        p1=p2=p3=p4 = NULL;

	while(1)
	{
	    int c0 = getwchar(&c);
	    if (c0 == -1) break;

	    if (!p1)
	    {
                tophist[c0]++;
		tophist_sum++;
	    }
#if 0 //二文字以前無視の場合
	    if(p4) p4->add_child(c0);
	    if(p3) p4 = p3->add_child(c0);
	    if(p2) p3 = p2->add_child(c0);
#endif
	    if(p2) p2->add_child(c0);
	    if(p1) p2 = p1->add_child(c0);
            p1 = root->add_child(c0);

	    //文末処理
	    if (c0 == kidxs("。") ||
		c0 == kidxs("．") ||
		c0 == kidxs("！") ||
		c0 == kidxs("？") )
	    {
		p1=p2=p3=p4=NULL;
	    }
#if 0
	    else
	    if (c0 == kidxs("『") ||
		c0 == kidxs("「") ||
		c0 == kidxs("」") ||
		c0 == kidxs("』") )
	    {

		p2=p3=p4=NULL;
	    }
#endif
	}//buf
    }

    //ランダム生成
    for(i=0;i<1000;i++)
    {
	//文頭一文字決定
	double r=1.0*rand()/RAND_MAX;
	int ri = (int)(tophist_sum*r);
	int c0 = 0;
	int len=0;
        while(c0<kmax)
	{
	    ri -= tophist[c0];
	    if (ri<=0)break;
	    c0++;
	}

	prtkidx(c0);
        p1 = root->find_child(c0);
	p2 = p3 = p4 = NULL;

	//二文字目以降
	while(len<200)
	{
	    MCDat *p;

	    len++;
#if 0
	    if (p4) 
		p = p4->random_child();
	    else
	    if (p3) 
		p = p3->random_child();
	    else
#endif
	    if (p2) 
		p = p2->random_child();
	    else
	    if (p1)
		p = p1->random_child();
	    else
		break;
	    if (!p) break;
	    c0 = p->val;
            prtkidx(c0);

	    if (p3) p4 = p3->find_child(c0);
	    if (p2) p3 = p2->find_child(c0);
	    if (p1) p2 = p1->find_child(c0);
	    p1 = root->find_child(c0);
	}	
	
	putchar(10);
    }
}