gfeature.cpp

00001 #include "gfeature.h"
00002 #include "rrf.h"
00003 
00004 GroundRRF::GroundRRF(RRF* rrf, Database* db)
00005     : rrf_(rrf), db_(db) 
00006 {
00007     numCounts_ = 0;
00008     int numFeatures = rrf_->getNumFeatures();
00009     for (int i = 0; i < numFeatures; i++) {
00010         allFeatures_.append(Array<GroundFeature*>());
00011         numCounts_ += rrf_->getFeature(i)->getNumWeights();
00012     }
00013 
00014     Array<int> nullGrounding;
00015     root_ = rrf->getRoot()->constructGroundFeature(this, nullGrounding, db);
00016     allFeatures_[rrf->getRoot()->getId()].append(root_);
00017 }
00018 
00019 
00020 void GroundRRF::getCounts(Array<double>& counts)
00021 {
00022     counts.growToSize(numCounts_);
00023 
00024     // For each weight of each feature, gather counts
00025     int countIdx = 0;
00026     for (int i = 0; i < allFeatures_.size(); i++) {
00027         for (int j = 0; j < rrf_->getFeature(i)->getNumWeights(); j++) {
00028 
00029             double totalCounts = 0.0;
00030             for (int k = 0; k < allFeatures_[i].size(); k++) {
00031                 if (allFeatures_[i][k] == NULL) {
00032                     continue;
00033                 }
00034                 totalCounts += allFeatures_[i][k]->getDeriv()
00035                     * allFeatures_[i][k]->computePartialDeriv(i,j);
00036 
00037 #if 0
00038                 cout << "Deriv " << i << " " << k << ": ";
00039                 cout << allFeatures_[i][k]->getDeriv() << endl;
00040                 cout << allFeatures_[i][k]->computePartialDeriv(i,j) << endl;
00041 #endif
00042             }
00043 #if 0
00044             double oldCounts = root_->getCounts(i,j);
00045             if (2.0*fabs((oldCounts - totalCounts)/(oldCounts + totalCounts))
00046                     > 0.01) {
00047                 cout << "Counts differ " << i << " " << j << ": \n";
00048                 cout << "Groundings: " << allFeatures_[i].size() << endl;
00049                 cout << "Deriv: " << allFeatures_[i][0]->getDeriv() << endl;
00050                 cout << "Partial: " << allFeatures_[i][0]->computePartialDeriv(i,j) << endl;
00051                 cout << oldCounts << endl;
00052                 cout << totalCounts << endl;
00053             }
00054                     
00055             counts.append(root_->getCounts(i, j));
00056 #endif
00057             counts[countIdx++] = totalCounts;
00058         }
00059     }
00060 }
00061 
00062 
00063 void GroundRRF::getPseudoCounts(Array<double>& counts, 
00064         const Array<int>& queryPreds, double samplingFrac)
00065 {
00066     // Set-up counts array
00067     counts.growToSize(numCounts_);
00068     for (int i = 0; i < numCounts_; i++) {
00069         counts[i] = 0.0;
00070     }
00071 
00072     // Get statistics for the true values; we'll need them for reference
00073     double trueSum = getLogValue();
00074 
00075     if (isnan(trueSum) || isinf(trueSum)) {
00076         for (int i = 0; i < numCounts_; i++) {
00077             counts[i] = trueSum;
00078         }
00079         return;
00080     }
00081 
00082     Array<double> trueCounts;
00083     getCounts(trueCounts);
00084 
00085     double falseSum;
00086     Array<double> falseCounts;
00087 
00088     // Sum the effect from each ground predicate
00089     for (int q = 0; q < queryPreds.size(); q++) {
00090 
00091         int i = queryPreds[q];
00092         for (int j = 0; j < getNumPredicateGroundings(i); j++) {
00093 
00094             if (frand() < samplingFrac) {
00095 
00096             // Corrupt the value of a single ground predicate
00097             bool origValue = getPredicateValue(i,j);
00098             setPredicateAndUpdate(i,j,!origValue);
00099 
00100             // Compute its contribution to the overall gradient
00101             falseSum = getLogValue();
00102             getCounts(falseCounts);
00103 
00104             double prob_false = sigmoid(falseSum - trueSum);
00105             for (int c = 0; c < numCounts_; c++) {
00106                 counts[c] += prob_false * (trueCounts[c] - falseCounts[c]);
00107             }
00108 
00109             // Reset the predicate's value
00110             setPredicateAndUpdate(i,j,origValue);
00111             }
00112         }
00113     }
00114 
00115 #if 0
00116     for (int c = 0; c < numCounts_; c++) {
00117         cout << "lpl counts[" << c << "] = " << counts[c] << endl;
00118     }
00119 #endif
00120 
00121     // Renormalize, so that we're independent of the sampling fraction
00122     for (int i = 0; i < numCounts_; i++) {
00123         counts[i] /= samplingFrac;
00124     }
00125 
00126     // Renormalize, because pll is actually *average* pll
00127     double numQueryPreds = 0.0;
00128     for (int q = 0; q < queryPreds.size(); q++) {
00129         numQueryPreds += getNumPredicateGroundings(queryPreds[q]);
00130     }
00131     for (int i = 0; i < numCounts_; i++) {
00132         counts[i] /= numQueryPreds;
00133     }
00134 }
00135 
00136 
00137 void GroundRRF::getPseudoCountsFast(Array<double>& counts, 
00138         const Array<int>& queryPreds, double samplingFrac)
00139 {
00140 #if 0 // THIS IS BROKEN -- don't use it!
00141     // Set-up counts array
00142     counts.growToSize(numCounts_);
00143     for (int i = 0; i < numCounts_; i++) {
00144         counts[i] = 0.0;
00145     }
00146 
00147     // Get statistics for the true values; we'll need them for reference
00148     double trueSum = getLogValue();
00149     Array<double> trueCounts;
00150     getCounts(trueCounts);
00151 
00152     double falseSum;
00153     Array<double> falseCounts;
00154     Array<double> falseCounts2;
00155 #if 0
00156     Array<double> counts1;
00157     Array<double> counts2;
00158     counts1.growToSize(numCounts_);
00159     counts2.growToSize(numCounts_);
00160     for (int i = 0; i < numCounts_; i++) {
00161         counts1[i] = 0.0;
00162         counts2[i] = 0.0;
00163     }
00164 #endif
00165 
00166     // Sum the effect from each ground predicate
00167     for (int q = 0; q < queryPreds.size(); q++) {
00168 
00169         int i = queryPreds[q];
00170         for (int j = 0; j < getNumPredicateGroundings(i); j++) {
00171 
00172             if (frand() < samplingFrac) {
00173 
00174             int c = 0;
00175 #if 0
00176             for (int f = 0; f < allFeatures_.size(); f++) {
00177                 Feature* feat = rrf_->getFeature(f);
00178                 for (int w = 0; w < feat->getNumWeights(); w++) {
00179                     cout << "pre[" << c <<  "] = " 
00180                         << feat->getCount(w) << endl;
00181                     feat->setCount(w,0.0);
00182                     c++;
00183                 }
00184             }
00185 #endif
00186 
00187             // Corrupt the value of a single ground predicate
00188             bool origValue = getPredicateValue(i,j);
00189             setPredicateAndUpdateCounts(i,j,!origValue);
00190 
00191             // Compute its contribution to the overall gradient
00192             falseSum = getLogValue();
00193             double prob_false = sigmoid(falseSum - trueSum);
00194 
00195             // c indexes into the count array.
00196             c = 0;
00197             for (int f = 0; f < allFeatures_.size(); f++) {
00198                 Feature* feat = rrf_->getFeature(f);
00199                 for (int w = 0; w < feat->getNumWeights(); w++) {
00200                     // DEBUG
00201                     if (c == 2) {
00202                         cout << "pfalse = " << prob_false << endl;
00203                         cout << "-featCount = " << -feat->getCount(w) << endl;
00204                     }
00205                     counts[c] += prob_false * -feat->getCount(w);
00206 #if 0
00207                     //cout << (trueCounts[c] - feat->getCount(w)) 
00208                     //    << " " << falseCounts[c] << " " << falseCounts2[c] << endl;
00209 
00210                     cout << prob_false*(trueCounts[c] - falseCounts[c]) << " " 
00211                         << prob_false*(trueCounts[c] - falseCounts2[c]) << endl;
00212 #endif
00213                     c++;
00214                 }
00215             }
00216 
00217 #if 0
00218             // DEBUG
00219             c = 0;
00220             for (int f = 0; f < allFeatures_.size(); f++) {
00221                 Feature* feat = rrf_->getFeature(f);
00222                 for (int w = 0; w < feat->getNumWeights(); w++) {
00223                     cout << "mid[" << c <<  "] = " 
00224                         << feat->getCount(w) << endl;
00225                     c++;
00226                 }
00227             }
00228 #endif
00229 
00230             // Reset the predicate's value
00231             //setPredicateAndUpdateCounts(i,j,origValue);
00232             setPredicateAndUpdateCounts(i,j,origValue);
00233 #if 0
00234             c = 0;
00235             for (int f = 0; f < allFeatures_.size(); f++) {
00236                 Feature* feat = rrf_->getFeature(f);
00237                 for (int w = 0; w < feat->getNumWeights(); w++) {
00238                     cout << "post[" << c <<  "] = " 
00239                         << feat->getCount(w) << endl;
00240                     //feat->setCount(w,0.0);
00241                     c++;
00242                 }
00243             }
00244 #endif
00245             }
00246         }
00247     }
00248 
00249     for (int c = 0; c < numCounts_; c++) {
00250         cout << "lpl1 counts[" << c << "] = " << counts[c] << endl;
00251     }
00252 
00253 #if 0
00254     for (int c = 0; c < numCounts_; c++) {
00255         cout << c << ": " << counts1[c] << " ; " << counts2[c] << endl;
00256     }
00257 #endif
00258 
00259     // Renormalize, so that we're independent of the sampling fraction
00260     for (int i = 0; i < queryPreds.size(); i++) {
00261         counts[i] /= samplingFrac;
00262     }
00263 #endif
00264 }
00265 
00266 
00267 double GroundRRF::getLogPseudoLikelihood(const Array<Predicate*>& queryPreds)
00268 {
00269     // LPL = log pseudo-likelihood, 
00270     //       the conditional probability of each query predicate 
00271     //       conditioned on all data.
00272     double lpl = 0.0;
00273     double logTruthProb = getLogValue();
00274 
00275     for (int q = 0; q < queryPreds.size(); q++) {
00276 
00277         int i = queryPreds[q]->getId();
00278         Array<int> grounding;
00279         for (int ti = 0; ti < queryPreds[q]->getNumTerms(); ti++) {
00280             grounding.append(queryPreds[q]->getTerm(ti)->getId());
00281         }
00282         int j = rrf_->getFeature(i)->getGroundingIndex(grounding, db_);
00283 
00284         if (allFeatures_[i][j] == NULL) {
00285             lpl += log(0.5);
00286         } else {
00287             double currValue = allFeatures_[i][j]->getValue();
00288             double newValue = (currValue == 0.0) ? 1.0 : 0.0;
00289             //setPredicateValue(i, j, newValue);
00290             setPredicateAndUpdate(i, j, newValue);
00291             double logUntruthProb = getLogValue();
00292             //setPredicateValue(i, j, currValue);
00293             setPredicateAndUpdate(i, j, currValue);
00294             
00295             if (fabs(logUntruthProb - logTruthProb) > 100) {
00296                 lpl += (logUntruthProb - logTruthProb);
00297             } else {
00298                 lpl += -log(1.0 + exp(logUntruthProb - logTruthProb));
00299             }
00300             //lpl += logTruthProb - log(exp(logTruthProb) + exp(logUntruthProb));
00301         }
00302     }
00303 
00304     return lpl/queryPreds.size();
00305 }
00306 
00307 
00308 double GroundRRF::getLogPseudoLikelihood(const Array<int>& queryPreds)
00309 {
00310     // LPL = log pseudo-likelihood, 
00311     //       the conditional probability of each query predicate 
00312     //       conditioned on all data.
00313     double lpl = 0.0;
00314     dirtyAll();
00315     double logTruthProb = getLogValue();
00316 
00317     // DEBUG
00318     //cout << "logTruthProb = " << logTruthProb << endl;
00319 
00320     int totalPreds = 0;
00321     for (int q = 0; q < queryPreds.size(); q++) {
00322 
00323         int i = queryPreds[q]; 
00324         for (int j = 0; j < allFeatures_[i-1].size(); j++) {
00325             totalPreds++;
00326             if (allFeatures_[i-1][j] == NULL) {
00327                 lpl += log(0.5);
00328             } else {
00329                 double currValue = allFeatures_[i-1][j]->getValue();
00330                 double newValue = (currValue == 0.0) ? 1.0 : 0.0;
00331                 //setPredicateValue(i, j, newValue);
00332                 setPredicateAndUpdate(i, j, newValue);
00333                 double logUntruthProb = getLogValue();
00334 #if 0
00335                 // DEBUG
00336                 if (isnan(logUntruthProb)) {
00337                     cout << "\nuntruthProb = " << logUntruthProb;
00338                 } else {
00339                     cout << ".";
00340                 }
00341 #endif
00342                 //setPredicateValue(i, j, currValue);
00343                 setPredicateAndUpdate(i, j, currValue);
00344 
00345 
00346                 double curr_lpl;
00347                 if (logUntruthProb - logTruthProb > 100) {
00348                     curr_lpl = logTruthProb - logUntruthProb;
00349                 } else if (logTruthProb - logUntruthProb > 100) {
00350                     curr_lpl = 0.0;
00351                 } else {
00352                     curr_lpl = -log(1.0 + exp(logUntruthProb - logTruthProb));
00353                 }
00354                 lpl += curr_lpl;
00355 #if 0
00356                 // DEBUG
00357                 //((PredicateGroundFeature*)allFeatures_[i-1][j])
00358                 //    ->getPredicate()->printWithStrVar(cout,domain);
00359                 cout << " = " << curr_lpl << endl;
00360                 cout << logUntruthProb << " ; " << logTruthProb << endl;
00361                 lpl += logTruthProb - log(exp(logTruthProb) + exp(logUntruthProb));
00362 #endif
00363             }
00364         }
00365     }
00366 
00367     // DEBUG
00368     //cout << endl;
00369 
00370     return lpl/totalPreds;
00371 }

Generated on Sun Jun 7 11:55:19 2009 for Alchemy by  doxygen 1.5.1