#!/usr/bin/perl
###############################################################################
#
# deftest.pl Define SpeechDat FDB test set by a controlled
# random selection, in which every accent region and
# gender is treated separately.
#
# Input file: session.tbl
#
# Output files:
trn.ses, tst.ses
#
# Version 1.1 Finn Tore Johansen, Telenor R&D, 09.12.97
#
###############################################################################
# Configuration variables
# desired testset size, this is a function of the database size
$N = 200; # For 1000- databases
# $N = 500; # For 2000+ databases
# database and language code
$DD = "a1";
$LL = "no";
# input file, you may need to edit the field description line below
# if more than the mandatory fields are present, or if any of
# the SEX or ACC fields are empty
$sesfile = "session.tbl";
# output files
$trnsetfile = "${DD}trn$LL.ses";
$tstsetfile = "${DD}tst$LL.ses";
# Load session file, compute the total number of speakers in each class
open(FILE,$sesfile) || die "Could not open $sesfile";
while (){
next if /^SES/; # Ignore header
s/\r\n$//;
($SES,$RED,$RET,$AGE,$SEX,$ACC,$REG,$ENV) = split(/\t/,$_);
$class{$SES} = "$SEX$ACC";
$count{$class{$SES}}++;
}
close(FILE);
@sessions = sort numerically keys(%class);
$F = $#sessions + 1;
@classes = keys(%count);
$K = $#classes + 1;
printf("Desired testset size N=%d\n",$N);
printf("Total database size F=%d\n",$F);
printf("Number of classes in selection K=%d\n\n",$K);
# Generate optimum (non integer) speaker distribution
foreach $bin (@classes){
$opt{$bin} = $count{$bin}*$N/$F;
$diff{$bin} = $count{$bin}-$opt{$bin};
}
printf("Initial speaker distribution:\n");
&printtab;
# Find testset with integer number of speakers
srand(0); # Use same seed to get same results every time
printf("Generating testset distribution...\n");
while (&sumcount > $N){
# Select the class with largest difference between #speakers and optimum
@sorted = sort bydiff (@classes);
$top = $sorted[0];
# Remove a random speaker from the selected class
@tmp = ();
foreach $ses (@sessions){
push(@tmp,$ses) if ( !($trainset{$ses}) && ($class{$ses} eq $top));
}
$ses = $tmp[rand(@tmp)];
$trainset{$ses} = 1;
$count{$top}--;
$diff{$top} = $count{$top} - $opt{$top};
}
&printtab;
# Save train and testset to list files
open(TRNFILE,">$trnsetfile") || die "Can't open $trnsetfile";
open(TSTFILE,">$tstsetfile") || die "Can't open $tstsetfile";
foreach $ses (@sessions){
if ($trainset{$ses}) {
printf(TRNFILE "$ses\r\n");
}
else {
printf(TSTFILE "$ses\r\n");
}
}
close(TRNFILE);
close(TSTFILE);
printf("Trainset saved to %s\n",$trnsetfile);
printf("Testset saved to %s\n",$tstsetfile);
# Subroutines
sub bydiff { $diff{$b} <=> $diff{$a} }
sub numerically { $a <=> $b }
sub sumcount {
$acc = 0;
foreach $bin (@classes){
$acc += $count{$bin};
}
return $acc;
}
sub printtab {
printf("\n%s\t%s\t%s\t\t%s\n","CLASS","#SPKS","OPT","DIFF");
foreach $bin (sort (@classes)){
printf("%s\t%d\t%f\t%f\n",$bin,$count{$bin},$opt{$bin},$diff{$bin});
}
printf("%s\t%d\n\n","SUM",&sumcount);
}