#!/usr/bin/perl ############################################################################### # # deftest.pl Define SpeechDat FDB test set by a controlled # random selection, in which every accent region and # gender is treated separately. # # Input file: session.tbl # # Output files:
trn.ses,
tst.ses # # Version 1.1 Finn Tore Johansen, Telenor R&D, 09.12.97 # ############################################################################### # Configuration variables # desired testset size, this is a function of the database size $N = 200; # For 1000- databases # $N = 500; # For 2000+ databases # database and language code $DD = "a1"; $LL = "no"; # input file, you may need to edit the field description line below # if more than the mandatory fields are present, or if any of # the SEX or ACC fields are empty $sesfile = "session.tbl"; # output files $trnsetfile = "${DD}trn$LL.ses"; $tstsetfile = "${DD}tst$LL.ses"; # Load session file, compute the total number of speakers in each class open(FILE,$sesfile) || die "Could not open $sesfile"; while (){ next if /^SES/; # Ignore header s/\r\n$//; ($SES,$RED,$RET,$AGE,$SEX,$ACC,$REG,$ENV) = split(/\t/,$_); $class{$SES} = "$SEX$ACC"; $count{$class{$SES}}++; } close(FILE); @sessions = sort numerically keys(%class); $F = $#sessions + 1; @classes = keys(%count); $K = $#classes + 1; printf("Desired testset size N=%d\n",$N); printf("Total database size F=%d\n",$F); printf("Number of classes in selection K=%d\n\n",$K); # Generate optimum (non integer) speaker distribution foreach $bin (@classes){ $opt{$bin} = $count{$bin}*$N/$F; $diff{$bin} = $count{$bin}-$opt{$bin}; } printf("Initial speaker distribution:\n"); &printtab; # Find testset with integer number of speakers srand(0); # Use same seed to get same results every time printf("Generating testset distribution...\n"); while (&sumcount > $N){ # Select the class with largest difference between #speakers and optimum @sorted = sort bydiff (@classes); $top = $sorted[0]; # Remove a random speaker from the selected class @tmp = (); foreach $ses (@sessions){ push(@tmp,$ses) if ( !($trainset{$ses}) && ($class{$ses} eq $top)); } $ses = $tmp[rand(@tmp)]; $trainset{$ses} = 1; $count{$top}--; $diff{$top} = $count{$top} - $opt{$top}; } &printtab; # Save train and testset to list files open(TRNFILE,">$trnsetfile") || die "Can't open $trnsetfile"; open(TSTFILE,">$tstsetfile") || die "Can't open $tstsetfile"; foreach $ses (@sessions){ if ($trainset{$ses}) { printf(TRNFILE "$ses\r\n"); } else { printf(TSTFILE "$ses\r\n"); } } close(TRNFILE); close(TSTFILE); printf("Trainset saved to %s\n",$trnsetfile); printf("Testset saved to %s\n",$tstsetfile); # Subroutines sub bydiff { $diff{$b} <=> $diff{$a} } sub numerically { $a <=> $b } sub sumcount { $acc = 0; foreach $bin (@classes){ $acc += $count{$bin}; } return $acc; } sub printtab { printf("\n%s\t%s\t%s\t\t%s\n","CLASS","#SPKS","OPT","DIFF"); foreach $bin (sort (@classes)){ printf("%s\t%d\t%f\t%f\n",$bin,$count{$bin},$opt{$bin},$diff{$bin}); } printf("%s\t%d\n\n","SUM",&sumcount); }