discriminativelearner.h

/*
 * All of the documentation and software included in the
 * Alchemy Software is copyrighted by Stanley Kok, Parag
 * Singla, Matthew Richardson, Pedro Domingos, Marc
 * Sumner, Hoifung Poon, and Daniel Lowd.
 * 
 * Copyright [2004-07] Stanley Kok, Parag Singla, Matthew
 * Richardson, Pedro Domingos, Marc Sumner, Hoifung
 * Poon, and Daniel Lowd. All rights reserved.
 * 
 * Contact: Pedro Domingos, University of Washington
 * (pedrod@cs.washington.edu).
 * 
 * Redistribution and use in source and binary forms, with
 * or without modification, are permitted provided that
 * the following conditions are met:
 * 
 * 1. Redistributions of source code must retain the above
 * copyright notice, this list of conditions and the
 * following disclaimer.
 * 
 * 2. Redistributions in binary form must reproduce the
 * above copyright notice, this list of conditions and the
 * following disclaimer in the documentation and/or other
 * materials provided with the distribution.
 * 
 * 3. All advertising materials mentioning features or use
 * of this software must display the following
 * acknowledgment: "This product includes software
 * developed by Stanley Kok, Parag Singla, Matthew
 * Richardson, Pedro Domingos, Marc Sumner, Hoifung
 * Poon, and Daniel Lowd in the Department of Computer Science and
 * Engineering at the University of Washington".
 * 
 * 4. Your publications acknowledge the use or
 * contribution made by the Software to your research
 * using the following citation(s): 
 * Stanley Kok, Parag Singla, Matthew Richardson and
 * Pedro Domingos (2005). "The Alchemy System for
 * Statistical Relational AI", Technical Report,
 * Department of Computer Science and Engineering,
 * University of Washington, Seattle, WA.
 * http://www.cs.washington.edu/ai/alchemy.
 * 
 * 5. Neither the name of the University of Washington nor
 * the names of its contributors may be used to endorse or
 * promote products derived from this software without
 * specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF WASHINGTON
 * AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
 * OF WASHINGTON OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * 
 */
#ifndef DISCRIMINATIVE_LEARNER_H_OCT_30_2005
#define DISCRIMINATIVE_LEARNER_H_OCT_30_2005

#include "infer.h"
#include "clause.h"
#include "timer.h"
#include "indextranslator.h"
#include "maxwalksat.h"

const bool dldebug = false;
const double EPSILON = .00001;

#ifndef PI
#define PI 3.14159265
#endif

double dotprod(const double* v1, const double* v2, int dim) 
{
  double total = 0.0;
  for (int i = 0; i < dim; i++) 
    total += v1[i] * v2[i];
  return total;
}

double dotprod(const double* v1, const Array<double>& v2, int dim) 
{
  double total = 0.0;
  for (int i = 0; i < dim; i++) 
    total += v1[i] * v2[i];
  return total;
}

double dotprod(const Array<double>& v1, const Array<double>& v2, int dim) 
{
  double total = 0.0;
  for (int i = 0; i < dim; i++) 
  {
    total += v1[i] * v2[i];
  }
  return total;
}

double vlen(const double* v1, int dim) 
{ return sqrt(dotprod(v1,v1,dim)); }
double vlen(const Array<double>& v1, int dim) 
{ return sqrt(dotprod(v1,v1,dim)); }

bool isOutputIter(int iter)
{
  while (iter % 10 == 0) 
    iter /= 10;

  if (iter == 1 || iter == 2 || iter == 5)
    return true;
  else
    return false;
}
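
// Note on the schedule above: isOutputIter() returns true for
// iterations 1, 2, 5, 10, 20, 50, 100, 200, 500, ..., i.e., a roughly
// logarithmic output schedule that keeps weight printing manageable in
// long runs.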



class DiscriminativeLearner
{
 public:

  DiscriminativeLearner(const Array<Inference*>& inferences,
                        const StringHashArray& nonEvidPredNames,
                        IndexTranslator* const & idxTrans,
                        const bool& lazyInference, const bool& withEM,
                        const bool& rescaleGradient, const int& method,
                        const double& lambda, const bool& preconditionCG,
                        const double& maxLambda)
    : domainCnt_(inferences.size()), idxTrans_(idxTrans),
      lazyInference_(lazyInference), rescaleGradient_(rescaleGradient),
      method_(method),
      // HACK: for now, we'll use the SMD lambda value for CG, even
      // though the two represent *very* different things!
      cg_lambda_(lambda), preconditionCG_(preconditionCG),
      maxBacktracks_(1000), backtrackCount_(0),
      cg_max_lambda_(maxLambda), withEM_(withEM) 
  { 
    cout << endl << "Constructing discriminative learner..." << endl << endl;

    inferences_.append(inferences);
    logOddsPerDomain_.growToSize(domainCnt_);
    clauseCntPerDomain_.growToSize(domainCnt_);
    
    for (int i = 0; i < domainCnt_; i++)
    {
      clauseCntPerDomain_[i] =
        inferences_[i]->getState()->getMLN()->getNumClauses();
      logOddsPerDomain_[i].growToSize(clauseCntPerDomain_[i], 0);
    }

    totalTrueCnts_.growToSize(domainCnt_);
    totalFalseCnts_.growToSize(domainCnt_);
    defaultTrueCnts_.growToSize(domainCnt_);
    defaultFalseCnts_.growToSize(domainCnt_);
    relevantClausesPerDomain_.growToSize(domainCnt_);
    //relevantClausesFormulas_ is set in findRelevantClausesFormulas()

    findRelevantClauses(nonEvidPredNames);
    findRelevantClausesFormulas();

      // Initialize the clause wts
    initializeWts(nonEvidPredNames);
    
      // Initialize the inference / state
    for (int i = 0; i < inferences_.size(); i++)
      inferences_[i]->init();
  }

  ~DiscriminativeLearner() 
  {
    for (int i = 0; i < trainTrueCnts_.size(); i++)
    {
      delete[] trainTrueCnts_[i];
      delete[] trainFalseCnts_[i];
    }
  }

    // set the prior means and std devs.
  void setMeansStdDevs(const int& arrSize, const double* const & priorMeans, 
                       const double* const & priorStdDevs) 
  {
    if (arrSize < 0) 
    {
      usePrior_ = false;
      priorMeans_ = NULL;
      priorStdDevs_ = NULL;
    } 
    else 
    {
      //cout << "arr size = " << arrSize<<", clause count = "<<clauseCnt_<<endl;
      usePrior_ = true;
      priorMeans_ = priorMeans;
      priorStdDevs_ = priorStdDevs;

      //cout << "\t\t Mean \t\t Std Deviation" << endl;
      //for (int i = 0; i < arrSize; i++) 
      //  cout << i << "\t\t" << priorMeans_[i]<<"\t\t"<<priorStdDevs_[i]<<endl;
    }
  }


  void setMLNWeights(double* const& weights)
  {
      // If there is one db or the clauses for multiple databases line up
    if (idxTrans_ == NULL)
    {
      int clauseCnt = clauseCntPerDomain_[0];
      for (int i = 0; i < domainCnt_; i++)
      {
        assert(clauseCntPerDomain_[i] == clauseCnt);
        const MLN* mln = inferences_[i]->getState()->getMLN();
        
        // Clauses are typically shared among MLNs for different
        // domains, so many of these calls to setWt() are redundant.
        // However, it doesn't hurt to set the same weight twice.
        for (int j = 0; j < clauseCnt; j++) 
        {
          Clause* c = (Clause*) mln->getClause(j);
          c->setWt(weights[j]);
        }
      }
    }
    else
    {   // The clauses for multiple databases do not line up
      Array<Array<double> >* wtsPerDomain = idxTrans_->getWtsPerDomain();
      const Array<Array<Array<IdxDiv>*> >* cIdxToCFIdxsPerDomain 
        = idxTrans_->getClauseIdxToClauseFormulaIdxsPerDomain();
      
      for (int i = 0; i < domainCnt_; i++)
      {
        Array<double>& wts = (*wtsPerDomain)[i];
        memset((double*)wts.getItems(), 0, wts.size()*sizeof(double));

          //map clause/formula weights to clause weights
        for (int j = 0; j < wts.size(); j++)
        {
          Array<IdxDiv>* idxDivs = (*cIdxToCFIdxsPerDomain)[i][j];
          for (int k = 0; k < idxDivs->size(); k++)
            wts[j] += weights[ (*idxDivs)[k].idx ] / (*idxDivs)[k].div;
        }
      }
      
      for (int i = 0; i < domainCnt_; i++)
      {
        Array<bool>& relevantClauses = relevantClausesPerDomain_[i];
        int clauseCnt = clauseCntPerDomain_[i];
        Array<double>& wts = (*wtsPerDomain)[i];
        assert(wts.size() == clauseCnt);
        const MLN* mln = inferences_[i]->getState()->getMLN();

        for (int j = 0; j < clauseCnt; j++)
        {
          Clause* c = (Clause*) mln->getClause(j);
          if (relevantClauses[j]) c->setWt(wts[j]);
          else                    c->setWt(0);
        }
      }
    }
  }
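
  // Example of the index translation above (a sketch; the exact IdxDiv
  // contents depend on how each formula was factored into CNF): if
  // formula weight weights[5] was split into two clauses, each of those
  // clauses carries an IdxDiv with idx = 5 and div = 2, so each clause
  // receives weights[5]/2.  A clause shared by several formulas sums one
  // such term per formula.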


  void setLogOddsWeights(double* weights, int numWeights)
  {
      // If there is one db or the clauses for multiple databases line up
    if (idxTrans_ == NULL)
    {
      for (int i = 0; i < domainCnt_; i++)
      {
        Array<double>& logOdds = logOddsPerDomain_[i];
        assert(numWeights == logOdds.size());
        for (int j = 0; j < logOdds.size(); j++) weights[j] += logOdds[j];
      }
    }
    else
    { //the clauses for multiple databases do not line up
      const Array<Array<Array<IdxDiv>*> >* cIdxToCFIdxsPerDomain 
        = idxTrans_->getClauseIdxToClauseFormulaIdxsPerDomain();

      Array<int> numLogOdds; 
      Array<double> wtsForDomain;
      numLogOdds.growToSize(numWeights);
      wtsForDomain.growToSize(numWeights);
    
      for (int i = 0; i < domainCnt_; i++)
      {
        memset((int*)numLogOdds.getItems(), 0, numLogOdds.size()*sizeof(int));
        memset((double*)wtsForDomain.getItems(), 0,
               wtsForDomain.size()*sizeof(double));

        Array<double>& logOdds = logOddsPerDomain_[i];
      
          // Map each clause's log odds to the weight of a
          // clause/formula
        for (int j = 0; j < logOdds.size(); j++)
        {
          Array<IdxDiv>* idxDivs = (*cIdxToCFIdxsPerDomain)[i][j];
          for (int k = 0; k < idxDivs->size(); k++)
          {
            wtsForDomain[ (*idxDivs)[k].idx ] += logOdds[j];
            numLogOdds[ (*idxDivs)[k].idx ]++;
          }
        }

        for (int j = 0; j < numWeights; j++)
          if (numLogOdds[j] > 0) weights[j] += wtsForDomain[j]/numLogOdds[j];
      }
    }

    for (int i = 0; i < numWeights; i++)
      weights[i] /= domainCnt_;
  }
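
  // Illustration of the initialization above (numbers are made up): with
  // two domains whose log odds for clause j are 1.0 and 2.0, the initial
  // weight is (1.0 + 2.0) / 2 = 1.5.  When an index translator is
  // present, each formula's value is first the average of the log odds
  // of the clauses that map to it (numLogOdds counts those
  // contributions), and those per-domain values are then averaged over
  // domains.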


  void getVariance(Array<double>& variance, int numWeights)
  {
    if (dldebug)
      cout << "Variances:" << endl;

      // Compute variance and gradient for each clause
    for (int clauseno = 0; clauseno < numWeights; clauseno++)
    {
      variance[clauseno] = 0.0;

        // Takes into account the prior
      if (usePrior_)
      {
        double sd = priorStdDevs_[clauseno];
        variance[clauseno] = 1.0/(sd * sd);
      }

        // Sum variance over all domains
      for (int i = 0; i < domainCnt_; i++)
      {
        const Array<double>* trueCnts = inferences_[i]->getClauseTrueCnts();
        const Array<double>* trueSqCnts = inferences_[i]->getClauseTrueSqCnts();
        int totalSamples = inferences_[i]->getNumSamples();

        double x   = (*trueCnts)[clauseno];
        double xsq = (*trueSqCnts)[clauseno];
        double n   = totalSamples;

          // Add variance for this domain
        variance[clauseno] += xsq/n - (x/n)*(x/n);
      }
      
      if (dldebug)
        cout << clauseno << ": " << variance[clauseno] << endl;
    }
  }
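
  // The per-domain term above is the sample variance of the clause's
  // true-grounding count under the inference distribution:
  //   Var[n_j] = E[n_j^2] - E[n_j]^2  ~  xsq/n - (x/n)^2,
  // where x is the sum of counts over the n samples and xsq the sum of
  // squared counts.  With a Gaussian prior, 1/sd^2 is added as the
  // prior's contribution to the diagonal of the Hessian.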


  const Array<double>* getHessianVectorProduct(const Array<double>& d)
  {
      // Compute Hessian vector product using counts from all inferences
    const Array<double>* Hd = inferences_[0]->getHessianVectorProduct(d);
    for (int i = 1; i < domainCnt_; i++) 
    {
      const Array<double>* Hd_i = inferences_[i]->getHessianVectorProduct(d);
      for (int j = 0; j < Hd->size(); j++) 
        (*Hd)[j] += (*Hd_i)[j];
      delete Hd_i;
    }

    return Hd;
  }
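
  // For the negative conditional log likelihood, the Hessian is the
  // covariance matrix of the clause counts, so each inference object can
  // estimate H*d from its samples, and the per-domain products simply
  // add.  The array returned by the first inference is reused as the
  // accumulator; the caller owns it and must delete it.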


  double computeQuadraticStepLength(double* gradient, const Array<double>& d, 
          const Array<double>* Hd, double lambda)
  {
    int numWeights = d.size();

      // Compute step length using trust region approach
    double dHd = dotprod(d, (*Hd), numWeights);
    double dd = dotprod(d, d, numWeights);
    double dg = dotprod(gradient, d, numWeights);
    double alpha = -dg/(dHd + lambda * dd);

      // Debug messages; can be helpful for diagnosing what is going on
    if (dldebug)
    {
      cout << "dHd = " << dHd << endl;
      cout << "dd = " << dd << endl;
      cout << "dg = " << dg << endl;
      cout << "alpha = " << alpha << endl;
    }

      // Because the problem is convex, the Hessian should always
      // be positive definite, and so alpha should always be non-negative.
      // Since our Hessian is approximate, we may end up with a negative
      // alpha.  In these cases, we used to make alpha zero (i.e., take a 
      // step of size zero), but now we return the negative alpha and
      // let the caller deal with it.
    if (alpha < 0.0) 
    {
        //alpha = 0.0;
      if (dldebug)
      {
        cout << "Alpha < 0!  Bad direction or Hessian." << endl;
        //cout << "Flipped alpha!  Setting step length to zero." << endl;
      }
    }

    return alpha;
  }
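
  // Where alpha comes from: along direction d, the damped quadratic
  // model of the objective is
  //   m(alpha) = alpha * (g . d) + (alpha^2 / 2) * (d'Hd + lambda * d'd),
  // where the lambda * d'd term implements the trust region.  Setting
  // dm/dalpha = 0 yields the minimizer used above:
  //   alpha = -(g . d) / (d'Hd + lambda * d'd).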


    // Learn the weights
  void learnWeights(double* const & weights, const int& numWeights,
                    const int& maxIter, const double& maxSec,
                    const double& learningRate, const double& momentum,
                    bool initWithLogOdds, const int& mwsMaxSubsequentSteps, 
                    bool periodicMLNs)
  {
    Timer timer;
    double begSec = timer.time();

    //cout << "Learning weights discriminatively... " << endl;
    memset(weights, 0, numWeights*sizeof(double));

    double* averageWeights = new double[numWeights];
    double* gradient = new double[numWeights];

      // For predicting and assessing changes to the gradient
    const Array<double>* delta_pred = NULL;
    const Array<double>* Hd = NULL;

      // For all methods
    double alpha = 1;
    Array<double> d(numWeights, 0.0);
      // Heap-allocated: a variable-length array here is a compiler
      // extension, not standard C++
    double* oldgradient = new double[numWeights];
    Array<double> oldd(numWeights, 0.0);

    //double pre_backtrack_lambda = cg_lambda_;

      // Set the initial weights to the average log odds across
      // domains/databases
    if (initWithLogOdds)
      setLogOddsWeights(weights, numWeights);

      // Initialize averageWeights
    memcpy(averageWeights, weights, numWeights*sizeof(double));

      // Make sure to save all counts, for computing gradient
    for (int i = 0; i < domainCnt_; i++)
      inferences_[i]->saveAllCounts(true);

      // For DN method
    Array<double> currVariance(numWeights, 0);

      // Used in CG
    Array<double> oldweights(numWeights, 0.0);

      // True if we're backtracking -- prevents us from making a forward
      // step if we've already committed to reverting our weights.
    bool backtracked = false;

    //int itersSinceSignificant = 0;

    for (int iter = 1; iter <= maxIter; iter++) 
    {
      cout << endl << "Iteration " << iter << " : " << endl << endl;

      double totalsec = timer.time() - begSec;
      int hour = (int)totalsec/3600;
      int min = (int)(totalsec - 3600*hour)/60;
      int sec = (int)(totalsec - 3600*hour - 60*min);

        // Print time so far
      cout << "Elapsed time: " << totalsec << "s";
      if (hour > 0)
        cout << " (" << hour << "h " << min << "m " << sec << "s)";
      else if (min > 0)
        cout << " (" << min << "m " << sec << "s)";
      cout << endl;

      if (maxSec > 0 && totalsec > maxSec)
      {
        cout << "Time limit exceeded.  Stopping learning." << endl;
        break;
      }

        // In 3rd iteration, we want to tell MWS to perform subsequentSteps
        // (Iter. 1 is a random assignment if initial weights are 0)
      if (iter == 3)
      {
        for (int i = 0; i < inferences_.size(); i++)
        {
            // Check if using MWS
          if (MaxWalkSat* mws = dynamic_cast<MaxWalkSat*>(inferences_[i]))
          {
            mws->setMaxSteps(mwsMaxSubsequentSteps);
          }
        }
      }

#if 0
      int tcounts[numWeights]; 
      int tcountsTotal[numWeights]; 
      int icounts[numWeights]; 
      int icountsTotal[numWeights]; 
#endif

      // Set the weights and run inference
      setMLNWeights(weights);
    
      if (withEM_) fillInMissingValues();
      cout << "Running inference ..." << endl;
      infer();
      cout << "Done with inference" << endl;

      cout << "Getting gradient..." << endl;
      //getGradient(weights, gradient, numWeights,
      //        tcounts, tcountsTotal, icounts, icountsTotal);
      getGradient(weights, gradient, numWeights);

      double realdist = 1.0;
      double preddist = 1.0;
      if (iter > 1 && delta_pred != NULL && !backtracked)
      {
        Array<double> dist(numWeights, 0.0);
        for (int i = 0; i < numWeights; i++)
          dist[i] = weights[i] - oldweights[i];

        // Below are four different methods for estimating
        // the real and predicted change in the function value.
        // These values are used to size the trust region
        // by updating lambda.
#if 1
        // Predicted change is quadratic approximation
        Array<double> avgPred(numWeights);
        avgPred.growToSize(numWeights);
        for (int i = 0; i < numWeights; i++)
          avgPred[i] = oldgradient[i] + (*delta_pred)[i]/2.0;
        preddist = dotprod(avgPred, dist, numWeights);

        // Real change is lower bound on actual change
        realdist = dotprod(gradient, dist, numWeights);
#elif 0
        // Actual distance is a lower bound on the change 
        // in our function: new gradient * dist
        // Predicted change adopts the same form, but uses
        // the predicted gradient instead: pred gradient * dist
        Array<double> predgradient(numWeights, 0.0);
        for (int i = 0; i < numWeights; i++)
          predgradient[i] = oldgradient[i] + (*delta_pred)[i];

        preddist = dotprod(predgradient, dist, numWeights);
        realdist = dotprod(gradient, dist, numWeights);
#elif 0
        // Act as if we're optimizing the dot product of the
        // gradient with the search direction instead of
        // the function value itself.  In practice, this doesn't work
        // well.
        Array<double> delta_actual(numWeights, 0.0);
        for (int i = 0; i < numWeights; i++)
          delta_actual[i] = gradient[i] - oldgradient[i];

        preddist = dotprod(*delta_pred, dist, numWeights);
        realdist = dotprod(delta_actual, dist, numWeights);
#else
        // Assume quadratic behavior of function
        // If this test passes, then either our quadratic approx is
        // good, or we were strangely lucky.  In practice, this isn't
        // very stable, since our function doesn't act quadratically.
        Array<double> avgPred(numWeights, 0.0);
        Array<double> avgActual(numWeights, 0.0);
        for (int i = 0; i < numWeights; i++)
        {
          avgPred[i] = oldgradient[i] + (*delta_pred)[i]/2.0;
          avgActual[i] = (oldgradient[i] + gradient[i])/2.0;
        }

        preddist = dotprod(avgPred, dist, numWeights);
        realdist = dotprod(avgActual, dist, numWeights);
#endif

          // LOG: Print out the trust-region diagnostics.
        cout << "Pred*dist = " << preddist << endl;
        cout << "Real*dist = " << realdist << endl;
        if (preddist > 0)
          cout << "Distratio = " << realdist/preddist << endl;
      }

      if (iter > 1 && (method_ == CG || method_ == DN))
      {

        // Adjust lambda using technique of (Fletcher, 1987)
        double delta = realdist/preddist;

        if (!backtracked && preddist == 0)
          cg_lambda_ /= 4;

#if 0
        // In general, overshooting the predicted gain in
        // function value is a good thing.  However, since we're
        // doing everything approximately, we may wish to adjust
        // lambda so that our quadratic approximation is always
        // close.  These update formulas penalize overshooting.
        if (delta > 0.75 && delta < 1.333)
          cg_lambda_ /= 2;
        if (delta < 0.25 || delta > 4.0)
        {
          if (cg_lambda_ * 4 > cg_max_lambda_)
            cg_lambda_ = cg_max_lambda_;
          else
            cg_lambda_ *= 4;
        }
#else
        if (!backtracked && preddist != 0 && delta > 0.75)
          cg_lambda_ /= 2;
        // if (delta < 0.25)   // Update schedule from (Fletcher, 1987)
        if (delta < 0.0)       // Gentler update schedule, to handle noise
        {
          if (cg_lambda_ * 4 > cg_max_lambda_)
            cg_lambda_ = cg_max_lambda_;
          else
            cg_lambda_ *= 4;
        }
#endif
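        // In short: delta is the ratio of actual to predicted
        // improvement.  delta > 0.75 means the quadratic model can be
        // trusted, so the trust region grows (lambda is halved);
        // delta < 0 means the step made things worse, so the trust
        // region shrinks (lambda is quadrupled, capped at
        // cg_max_lambda_) and the step is undone by the backtracking
        // code below.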
        cout << "Lambda = " << cg_lambda_ << endl;

        if (delta < 0.0 && backtrackCount_ < maxBacktracks_)
        {
          cout << "Backtracking!" << endl;
          iter--;
#if 0
          double oldalpha = alpha;
          alpha = computeQuadraticStepLength(oldgradient, d, Hd, cg_lambda_);
          for (int i = 0; i < numWeights; i++)
          {
            weights[i] = oldweights[i] + alpha * d[i];
            (*delta_pred)[i] *= alpha/oldalpha;
          }
#else
          for (int i = 0; i < numWeights; i++)
            weights[i] = oldweights[i];

#endif
          for (int i = 0; i < domainCnt_; i++) 
            inferences_[i]->restoreCnts();

          backtracked = true;
          backtrackCount_++;
        }
        else
        {
          backtracked = false;
          backtrackCount_ = 0;
        }
      }

      if (!backtracked)
      {
        for (int i = 0; i < domainCnt_; i++)
          inferences_[i]->saveCnts();
      }


#if 0
      // Check for convergence.  If we're backtracking, skip these tests.
      if (!backtracked)
      {
       for (int i = 0; i < numWeights; i++) 
       {
        // Smooth with Laplace prior
        double priorCounts = 1.0;  
        tcounts[i]      += priorCounts;
        tcountsTotal[i] += priorCounts * 2.0;
        icounts[i]      += priorCounts;
        icountsTotal[i] += priorCounts * 2.0;

        // Adjust training counts by the Gaussian prior
        double sd = priorStdDevs_[i];
        double effectiveCounts = tcounts[i] - 
            (weights[i] - priorMeans_[i])/(sd*sd);

        if (tcountsTotal[i] == 0) 
          continue;

        double p = effectiveCounts/tcountsTotal[i];
        if (p < 0.0) { p = 0.0; }
        if (p > 1.0) { p = 1.0; }
        double cdf = // compute CDF for binomial here...

        // TODO: make the significance level a command-line parameter
        if (cdf < 0.05 || cdf > 0.95) 
        {
          // DEBUG
          cout << "i = " << i << endl;
          cout << "tc' = " << effectiveCounts << endl;
          cout << "tc = " << tcounts[i] << "; tct = " << tcountsTotal[i] << endl;
          cout << "ic = " << icounts[i] << "; ict = " << icountsTotal[i] << endl;
          cout << "Base prob: " << p << endl;
          cout << "Inferred prob: " << (double)icounts[i]/icountsTotal[i] << endl;
          cout << "cdf = " << cdf << endl;

          itersSinceSignificant = 0;
          break;
        }
       }

       // If no difference has been significant for 10 iterations, stop.
       // TODO: make the constant 10 a command-line parameter.
       itersSinceSignificant++;
       if (itersSinceSignificant > 10) 
       {
          cout << "No significant difference in any dimension after "
              << "10 iterations; halting!";
          break;
       }
      }
#endif


        // Scaled conjugate gradient: compute conjugate gradient direction
        //     using Polak-Ribiere and step size using Hessian matrix.
        //     Optional preconditioner (inverse diagonalized Hessian)
      if (method_ == CG && !backtracked)
      {
        // Precond stores the diagonal entries of preconditioning matrix
        Array<double> precond(numWeights, 1.0);

        // If preconditioning, use the inverse diagonalized Hessian
        if (preconditionCG_)
        {
          Array<double> variance(numWeights, 0.0);
          getVariance(variance, numWeights);

          for (int clauseno = 0; clauseno < numWeights; clauseno++)
            precond[clauseno] = 1.0/variance[clauseno];
            //precond[clauseno] = 1.0/(variance[clauseno] + 1.0);
        }

        double beta = 0.0;

          // Compute beta using Polak-Ribiere form:
          //   beta = g_j+1 (g_j+1 - g_j) / (g_j g_j)
          // Preconditioned:
          //   beta = g_j+1 M-1 (g_j+1 - g_j) / (g_j M-1 g_j)
        double beta_num = 0.0;
        double beta_denom = 0.0;
        
        if (iter > 1)
        {
          for (int i = 0; i < numWeights; i++)
          {
            beta_num   += gradient[i] * precond[i] 
                * (gradient[i] - oldgradient[i]);
            beta_denom += oldgradient[i] * precond[i] * oldgradient[i];
          }
          beta = beta_num/beta_denom;
        }
        else
          beta = 0.0;

        if (dldebug)
          cout << "Beta = " << beta << endl;
        
          // Compute new direction
        for (int w = 0; w < numWeights; w++)
          d[w] = -precond[w]*gradient[w] + beta*oldd[w];

        delete Hd;
        Hd = getHessianVectorProduct(d);
        alpha = computeQuadraticStepLength(gradient, d, Hd, cg_lambda_);

        // HACK DEBUG -- if this isn't working, ignore the conjugacy
        // criterion, since it sometimes messes us up.
        if (alpha < 0.0)
        {
          for (int w = 0; w < numWeights; w++)
            d[w] = -precond[w]*gradient[w];

          delete Hd;
          Hd = getHessianVectorProduct(d);
          alpha = computeQuadraticStepLength(gradient, d, Hd, cg_lambda_);
        }
      }

        // Diagonal Newton: divide gradient by clause variance,
        //     and use Hessian to pick step length
      if (method_ == DN && !backtracked)
      {
        Array<double> variance(numWeights, 0.0);
        getVariance(variance, numWeights);

        for (int w = 0; w < numWeights; w++) 
        {
          if (fabs(gradient[w]) < 0.00000001)
            gradient[w] = 0;
          d[w] = -gradient[w]/variance[w];
        }

        delete Hd;
        Hd = getHessianVectorProduct(d);
        alpha = computeQuadraticStepLength(gradient, d, Hd, cg_lambda_);
        // alpha = learningRate; (Alternate method for choosing step length.)
      } 

      if (method_ == SIMPLE)
      {
          // Search direction is just the gradient
        for (int w = 0; w < numWeights; w++)
          d[w] = -gradient[w];

          // Step length scalar is the learning rate
        alpha = learningRate;
      }

      if (!backtracked && alpha <= 0.0)
      {
        // If alpha is negative, then either the direction or the
        // Hessian is in error.  We call this a backtrack so that
        // we can gather more samples while keeping the old samples.
        backtracked = true;
        iter--;
      }

      if (!backtracked)
      {
          // Compute total weight change
        Array<double> wchange(numWeights, 0.0);
        for (int w = 0; w < numWeights; w++) 
          wchange[w] = d[w] * alpha + (weights[w] - oldweights[w]) * momentum;

        // Convergence criterion for 2nd order methods: 
        // Stop when the maximum predicted improvement in log likelihood 
        // is very small.
        const double MIN_LL_CHANGE = 0.0001;
        double maxchange = -dotprod(gradient, wchange, numWeights);
        cout << "Maximum estimated improvement: " << maxchange << endl;
        if ((method_ == CG || method_ == DN) &&
            maxchange < MIN_LL_CHANGE)
        {
          cout << "Upper bound is less than " << MIN_LL_CHANGE
               << "; halting learning." << endl;
          break;
        }

        // Save weights, gradient, and direction and adjust the weights
        for (int w = 0; w < numWeights; w++)
        {
          oldweights[w] = weights[w];
          oldd[w] = d[w];
          oldgradient[w] = gradient[w];

          weights[w] += wchange[w];
          averageWeights[w] = (iter * averageWeights[w] + weights[w])/(iter + 1);
        }

        // Predict the next change to the gradient using the Hessian
        delete delta_pred;
        delta_pred = getHessianVectorProduct(wchange);

        // Reset counts.  
        // (If we're backtracking, we DO NOT want to
        // reset counts, since we're restoring the old counts. 
        // That's why this is handled here.)
        for (int i = 0; i < domainCnt_; i++) 
          inferences_[i]->resetCnts();
      }

      bool print = !backtracked && (!periodicMLNs || isOutputIter(iter));
      if (print)
      {
        cout << "Weights:\tNew\t\tAverage\n";
        for (int w = 0; w < numWeights; w++)
          cout << w << ":\t\t" << weights[w] << "\t\t"
               << averageWeights[w] << endl; 
      }

      // done with an iteration
    }
    
    cout << endl << "Learned Weights : " << endl;
    for (int w = 0; w < numWeights; w++) 
    {
        // If using MWS, then assign average weights to MLN
      if (dynamic_cast<MaxWalkSat*>(inferences_[0]))
      {
        weights[w] = averageWeights[w];
      }
      cout << w << ":" << weights[w] << endl;
    }

    delete [] averageWeights;
    delete [] gradient;
    delete [] oldgradient;
      // These may still hold the last Hessian products; deleting NULL
      // is a no-op
    delete delta_pred;
    delete Hd;
    
    resetDBs();
  }
 
 
 private:
 
  void resetDBs() 
  {
    if (!lazyInference_)
    {
      for (int i = 0; i < domainCnt_; i++) 
      {
        VariableState* state = inferences_[i]->getState();
        Database* db = state->getDomain()->getDB();
          // Change known NE to original values
        const GroundPredicateHashArray* knePreds = state->getKnePreds();
        const Array<TruthValue>* knePredValues = state->getKnePredValues();
        db->setValuesToGivenValues(knePreds, knePredValues);
          // Set unknown NE back to UNKNOWN
        const GroundPredicateHashArray* unePreds = state->getUnePreds();
        for (int predno = 0; predno < unePreds->size(); predno++) 
          db->setValue((*unePreds)[predno], UNKNOWN);
      }
    }
  }
  void findRelevantClauses(const StringHashArray& nonEvidPredNames) 
  {
    for (int d = 0; d < domainCnt_; d++)
    {
      int clauseCnt = clauseCntPerDomain_[d];
      Array<bool>& relevantClauses = relevantClausesPerDomain_[d];
      relevantClauses.growToSize(clauseCnt);
      memset((bool*)relevantClauses.getItems(), false, 
             relevantClauses.size()*sizeof(bool));
      const Domain* domain = inferences_[d]->getState()->getDomain();
      const MLN* mln = inferences_[d]->getState()->getMLN();
    
      const Array<IndexClause*>* indclauses;
      const Clause* clause;
      int predid, clauseid;
      for (int i = 0; i < nonEvidPredNames.size(); i++)
      {
        predid = domain->getPredicateId(nonEvidPredNames[i].c_str());
        //cout << "finding the relevant clauses for predid = " << predid 
        //     << " in domain " << d << endl;
        indclauses = mln->getClausesContainingPred(predid);
        if (indclauses) 
        {
          for (int j = 0; j < indclauses->size(); j++) 
          {
            clause = (*indclauses)[j]->clause;
            clauseid = mln->findClauseIdx(clause);
            
            // If clause is external to this mln, then it stays irrelevant
            if (!mln->isExternalClause(clauseid)) relevantClauses[clauseid] = true;
          }
        }
      }    
    }
  }

  
  void findRelevantClausesFormulas()
  {
    if (idxTrans_ == NULL)
    {
      Array<bool>& relevantClauses = relevantClausesPerDomain_[0];
      relevantClausesFormulas_.growToSize(relevantClauses.size());
      for (int i = 0; i < relevantClauses.size(); i++)
        relevantClausesFormulas_[i] = relevantClauses[i];
    }
    else
    {
      idxTrans_->setRelevantClausesFormulas(relevantClausesFormulas_,
                                            relevantClausesPerDomain_[0]);
      cout << "Relevant clauses/formulas:" << endl;
      idxTrans_->printRelevantClausesFormulas(cout, relevantClausesFormulas_);
      cout << endl;
    }
  }


  void calculateCounts(Array<double>& trueCnt, Array<double>& falseCnt,
                       const int& domainIdx, const bool& hasUnknownPreds) 
  {
    Clause* clause;
    double tmpUnknownCnt;
    int clauseCnt = clauseCntPerDomain_[domainIdx];
    Array<bool>& relevantClauses = relevantClausesPerDomain_[domainIdx];
    const MLN* mln = inferences_[domainIdx]->getState()->getMLN();
    const Domain* domain = inferences_[domainIdx]->getState()->getDomain();

    for (int clauseno = 0; clauseno < clauseCnt; clauseno++) 
    {
      if (!relevantClauses[clauseno]) 
      {
        continue;
        //cout << "\n\nthis is an irrelevant clause.." << endl;
      }
      clause = (Clause*) mln->getClause(clauseno);
      clause->getNumTrueFalseUnknownGroundings(domain, domain->getDB(), 
                                               hasUnknownPreds,
                                               trueCnt[clauseno],
                                               falseCnt[clauseno],
                                               tmpUnknownCnt);
      assert(hasUnknownPreds || (tmpUnknownCnt==0));
    }
  }


  void initializeWts(const StringHashArray& nonEvidPredNames)
  {
    cout << "Initializing weights ..." << endl;
    
    bool hasUnknownPreds;
    
    Array<Predicate*> gpreds;
    Array<Predicate*> ppreds;
    Array<TruthValue> gpredValues;
    Array<TruthValue> tmpValues;

    if (!lazyInference_)
    { // Eager inference
      trainTrueCnts_.growToSize(domainCnt_);
      trainFalseCnts_.growToSize(domainCnt_);
    }
    
    for (int i = 0; i < domainCnt_; i++)
    {
      if (dldebug) cout << "Domain " << i << endl;
      int clauseCnt = clauseCntPerDomain_[i];
      if (dldebug) cout << "Clause count: " << clauseCnt << endl;
      VariableState* state = inferences_[i]->getState();
      Domain* domain = (Domain*)state->getDomain();

      if (lazyInference_)
      {
        domain->getDB()->setPerformingInference(false);
  
        //cout << endl << "Getting the counts for the domain " << i << endl;
        gpreds.clear();
        gpredValues.clear();
        tmpValues.clear();
        for (int predno = 0; predno < nonEvidPredNames.size(); predno++) 
        {
          ppreds.clear();
          int predid = domain->getPredicateId(nonEvidPredNames[predno].c_str());
          Predicate::createAllGroundings(predid, domain, ppreds);
          gpreds.append(ppreds);
        }
      
        domain->getDB()->alterTruthValue(&gpreds, UNKNOWN, FALSE, &gpredValues);
      
        hasUnknownPreds = false;
      
        Array<double>& trueCnt = totalTrueCnts_[i];
        Array<double>& falseCnt = totalFalseCnts_[i];
        trueCnt.growToSize(clauseCnt);
        falseCnt.growToSize(clauseCnt);
        calculateCounts(trueCnt, falseCnt, i, hasUnknownPreds);

        //cout << "got the total counts..\n\n\n" << endl;
        
        hasUnknownPreds = true;

        domain->getDB()->setValuesToUnknown(&gpreds, &tmpValues);

        Array<double>& dTrueCnt = defaultTrueCnts_[i];
        Array<double>& dFalseCnt = defaultFalseCnts_[i];
        dTrueCnt.growToSize(clauseCnt);
        dFalseCnt.growToSize(clauseCnt);
        calculateCounts(dTrueCnt, dFalseCnt, i, hasUnknownPreds);

        for (int predno = 0; predno < gpreds.size(); predno++) 
          delete gpreds[predno];

        domain->getDB()->setPerformingInference(true);
      }
      else
      { // Eager inference
        const GroundPredicateHashArray* unePreds = state->getUnePreds();
        const GroundPredicateHashArray* knePreds = state->getKnePreds();

        trainTrueCnts_[i] = new double[clauseCnt];
        trainFalseCnts_[i] = new double[clauseCnt];

        if (dldebug)
        {
          cout << "Unknown non-evid preds: " << unePreds->size() << endl;
          cout << "Known non-evid preds: " << knePreds->size() << endl;
        }
        int totalPreds = unePreds->size() + knePreds->size();
          // Used to store gnd preds to be ignored in the count because they are
          // UNKNOWN
        Array<bool>* unknownPred = new Array<bool>;
        unknownPred->growToSize(totalPreds, false);
        for (int predno = 0; predno < totalPreds; predno++) 
        {
          GroundPredicate* p;
          if (predno < unePreds->size())
            p = (*unePreds)[predno];
          else
            p = (*knePreds)[predno - unePreds->size()];
          TruthValue tv = state->getDomain()->getDB()->getValue(p);

          //assert(tv != UNKNOWN);
          //bool activate = true;
          bool activate = false;
          if (tv == TRUE)
          {
            state->setValueOfAtom(predno + 1, true, activate, -1);
            p->setTruthValue(true);
          }
          else
          {
            state->setValueOfAtom(predno + 1, false, activate, -1);
            p->setTruthValue(false);
              // Can have unknown truth values when using EM. We want to ignore
              // these when performing the counts
            if (tv == UNKNOWN)
            {
              (*unknownPred)[predno] = true;
            }
          }
        }

        state->initMakeBreakCostWatch();
        //cout<<"getting true cnts => "<<endl;
        state->getNumClauseGndingsWithUnknown(trainTrueCnts_[i], clauseCnt, true,
                                              unknownPred);
        //cout<<endl;
        //cout<<"getting false cnts => "<<endl;
        state->getNumClauseGndingsWithUnknown(trainFalseCnts_[i], clauseCnt,
                                              false, unknownPred);
        delete unknownPred;
        if (dldebug)
        {
          for (int clauseno = 0; clauseno < clauseCnt; clauseno++)
          {
            cout << clauseno << " : tc = " << trainTrueCnts_[i][clauseno]
                 << " ** fc = " << trainFalseCnts_[i][clauseno] << endl;
          }
        }
      }
    }

    double tc, fc;
    int nonEvidPreds = 0;
    cout << "List of CNF Clauses : " << endl;
    for (int clauseno = 0; clauseno < clauseCntPerDomain_[0]; clauseno++)
    {
      tc = 0.0; fc = 0.0;
      for (int i = 0; i < domainCnt_; i++) 
      {
        Domain* domain = (Domain*)inferences_[i]->getState()->getDomain();
        Array<bool>& relevantClauses = relevantClausesPerDomain_[i];
        Array<double>& logOdds = logOddsPerDomain_[i];
      
        if (!relevantClauses[clauseno])
        {
          domain->setNumTrueNonEvidGndings(clauseno, 0);
          domain->setNumFalseNonEvidGndings(clauseno, 0);
          logOdds[clauseno] = 0.0;
          continue;
        }
        
        cout << clauseno << ":";
        const Clause* clause =
          inferences_[i]->getState()->getMLN()->getClause(clauseno);
        clause->print(cout, inferences_[0]->getState()->getDomain());
        cout << endl;
      
        double domainTrueCnts = 0.0;
        double domainFalseCnts = 0.0;
        if (lazyInference_)
        {
          domainTrueCnts = totalTrueCnts_[i][clauseno] -
                           defaultTrueCnts_[i][clauseno];
          domainFalseCnts = totalFalseCnts_[i][clauseno] -
                            defaultFalseCnts_[i][clauseno];
        }
        else
        {
          domainTrueCnts = trainTrueCnts_[i][clauseno];
          domainFalseCnts = trainFalseCnts_[i][clauseno];
        }

        tc += domainTrueCnts;
        fc += domainFalseCnts;
        domain->setNumTrueNonEvidGndings(clauseno, domainTrueCnts);
        domain->setNumFalseNonEvidGndings(clauseno, domainFalseCnts);

        if (dldebug)
          cout << clauseno << " : tc = " << tc << " ** fc = " << fc << endl;

          // Count the non-evidence predicates in this clause
        for (int k = 0; k < clause->getNumPredicates(); k++)
        {
          const char* predName =
            clause->getPredicate(k)->getTemplate()->getName();
          if (nonEvidPredNames.contains(predName)) 
            nonEvidPreds++;
        }
      }
      
      double weight = 0.0;
      double totalCnt = tc + fc;

      if (totalCnt == 0 || nonEvidPreds == 0) 
      {
        //cout << "NOTE: Total count is 0 for clause " << clauseno << endl;
        weight = EPSILON;
      } 
      else 
      {
        if (fc == 0.0)
          fc = 0.00001;
        if (tc == 0.0)
          tc = 0.00001;

        double priorodds = pow(2.0, nonEvidPreds) - 1.0;
        weight = log(tc/fc) - log(priorodds);
      }

      for (int i = 0; i < domainCnt_; i++) 
      {
        Array<double>& logOdds = logOddsPerDomain_[i];
        logOdds[clauseno] = weight;
      }
    }
  }
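
  // Worked example of the log-odds initialization above (illustrative
  // numbers): suppose a clause whose counted non-evidence predicates
  // give priorodds = 2^1 - 1 = 1, and its groundings are true 80 times
  // and false 20 times in the training data.  Then
  //   weight = log(tc/fc) - log(priorodds) = log(80/20) - log(1) ~ 1.386.
  // Clauses with no groundings or no non-evidence predicates get the
  // small default weight EPSILON.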

  
  void infer() 
  {
    for (int i = 0; i < domainCnt_; i++) 
    {
      if (dldebug) cout << "in domain " << i << endl;
      VariableState* state = inferences_[i]->getState();
      state->setGndClausesWtsToSumOfParentWts();
      // MWS: Search is started from state at end of last iteration
      state->init();
      inferences_[i]->infer();
      state->saveLowStateToGndPreds();
    }
  }

  void fillInMissingValues()
  {
    assert(withEM_);
    cout << "Filling in missing data ..." << endl;
      // Get values of initial unknown preds by producing MAP state of
      // unknown preds given known evidence and non-evidence preds (VPEM)
    Array<Array<TruthValue> > ueValues;
    ueValues.growToSize(domainCnt_);
    for (int i = 0; i < domainCnt_; i++)
    {
      VariableState* state = inferences_[i]->getState();
      const Domain* domain = state->getDomain();
      const GroundPredicateHashArray* knePreds = state->getKnePreds();
      const Array<TruthValue>* knePredValues = state->getKnePredValues();

        // Mark known non-evidence preds as evidence
      domain->getDB()->setValuesToGivenValues(knePreds, knePredValues);

        // Infer missing values
      state->setGndClausesWtsToSumOfParentWts();
        // MWS: Search is started from state at end of last iteration
      state->init();
      inferences_[i]->infer();
      state->saveLowStateToGndPreds();

      if (dldebug)
      {
        cout << "Inferred following values: " << endl;
        inferences_[i]->printProbabilities(cout);
      }

        // Compute counts
      int clauseCnt = clauseCntPerDomain_[i];
      state->initMakeBreakCostWatch();
      //cout<<"getting true cnts => "<<endl;
      const Array<double>* clauseTrueCnts =
        inferences_[i]->getClauseTrueCnts();
      assert(clauseTrueCnts->size() == clauseCnt);
      int numSamples = inferences_[i]->getNumSamples();
      for (int j = 0; j < clauseCnt; j++)
        trainTrueCnts_[i][j] = (*clauseTrueCnts)[j]/numSamples;

        // Set evidence values back
      //assert(uePreds.size() == ueValues[i].size());
      //domain->getDB()->setValuesToGivenValues(&uePreds, &ueValues[i]);
        // Set non-evidence values to unknown
      Array<TruthValue> tmpValues;
      tmpValues.growToSize(knePreds->size());
      domain->getDB()->setValuesToUnknown(knePreds, &tmpValues);
    }
    cout << "Done filling in missing data" << endl;
  }

#if 0
  // This was used by the statistical significance convergence test,
  // to see if our counts could be drawn from the same distribution
  // as the true counts.

  void getCountsForDomain(int* const & clauseTrainCnts, 
          int* const & clauseTrainTotal, int* const & clauseInferredCnts, 
          int* const & clauseInferredTotal, int domainIdx)
  {
    Array<bool>& relevantClauses = relevantClausesPerDomain_[domainIdx];
    int clauseCnt = clauseCntPerDomain_[domainIdx];
    double* trainCnts = NULL;
    double* inferredCnts = NULL;
    Array<double>& totalTrueCnts = totalTrueCnts_[domainIdx];
    //Array<double>& totalFalseCnts = totalFalseCnts_[domainIdx];
    Array<double>& defaultTrueCnts = defaultTrueCnts_[domainIdx];
    const MLN* mln = inferences_[domainIdx]->getState()->getMLN();
    const Domain* domain = inferences_[domainIdx]->getState()->getDomain();

      // These are arrays of int, so clear them with sizeof(int)
    memset(clauseTrainCnts, 0, clauseCnt*sizeof(int));
    memset(clauseInferredCnts, 0, clauseCnt*sizeof(int));
    memset(clauseTrainTotal, 0, clauseCnt*sizeof(int));
    memset(clauseInferredTotal, 0, clauseCnt*sizeof(int));

    if (!lazyInference_)
    {
      if (!inferredCnts) inferredCnts = new double[clauseCnt];

      const Array<double>* clauseTrueCnts =
        inferences_[domainIdx]->getClauseTrueCnts();
      assert(clauseTrueCnts->size() == clauseCnt);
      for (int i = 0; i < clauseCnt; i++)
        inferredCnts[i] = (*clauseTrueCnts)[i];

      trainCnts = trainTrueCnts_[domainIdx];
    }
      //loop over all the training examples
    //cout << "\t\ttrain count\t\t\t\tinferred count" << endl << endl;
    for (int clauseno = 0; clauseno < clauseCnt; clauseno++) 
    {
      if (!relevantClauses[clauseno]) continue;
      
      // Compute total groundings
      int totalGndings = (int)(trainFalseCnts_[domainIdx][clauseno] 
              + trainTrueCnts_[domainIdx][clauseno]);
      clauseTrainTotal[clauseno] += totalGndings;
      clauseInferredTotal[clauseno] += totalGndings * 
          inferences_[domainIdx]->getNumSamples();
      
      if (lazyInference_)
      {
        Clause* clause = (Clause*) mln->getClause(clauseno);

        double trainCnt = totalTrueCnts[clauseno];
        double inferredCnt =
          clause->getNumTrueGroundings(domain, domain->getDB(), false);
        trainCnt -= defaultTrueCnts[clauseno];
        inferredCnt -= defaultTrueCnts[clauseno];
        clauseTrainCnts[clauseno] += (int)trainCnt;
        clauseInferredCnts[clauseno] += (int)inferredCnt;
      }
      else
      {
        clauseTrainCnts[clauseno] += (int)trainCnts[clauseno];
        clauseInferredCnts[clauseno] += (int)inferredCnts[clauseno];
      }
      //cout << clauseno << ":\t\t" <<trainCnt<<"\t\t\t\t"<<inferredCnt<<endl;
    }

    delete[] inferredCnts;
  }
#endif
  
  // Get the gradient of the negative log likelihood for one domain
  void getGradientForDomain(double* const & gradient, const int& domainIdx)
  {
    Array<bool>& relevantClauses = relevantClausesPerDomain_[domainIdx];
    int clauseCnt = clauseCntPerDomain_[domainIdx];
    double* trainCnts = NULL;
    double* inferredCnts = NULL;
    double* clauseTrainCnts = new double[clauseCnt]; 
    double* clauseInferredCnts = new double[clauseCnt];
    //double trainCnt, inferredCnt;
    //Array<double>& totalTrueCnts = totalTrueCnts_[domainIdx];
    //Array<double>& defaultTrueCnts = defaultTrueCnts_[domainIdx];
    //const MLN* mln = inferences_[domainIdx]->getState()->getMLN();
    //const Domain* domain = inferences_[domainIdx]->getState()->getDomain();

    memset(clauseTrainCnts, 0, clauseCnt*sizeof(double));
    memset(clauseInferredCnts, 0, clauseCnt*sizeof(double));

    if (!inferredCnts) inferredCnts = new double[clauseCnt];

    const Array<double>* clauseTrueCnts =
      inferences_[domainIdx]->getClauseTrueCnts();
    assert(clauseTrueCnts->size() == clauseCnt);
    int numSamples = inferences_[domainIdx]->getNumSamples();
    for (int i = 0; i < clauseCnt; i++)
    {
      inferredCnts[i] = (*clauseTrueCnts)[i] / numSamples;
    }
    if (!lazyInference_) trainCnts = trainTrueCnts_[domainIdx];

    if (dldebug)
      cout << "numSamples = " << numSamples << endl;

      //loop over all the training examples
    //cout << "\t\ttrain count\t\t\t\tinferred count" << endl << endl;
    for (int clauseno = 0; clauseno < clauseCnt; clauseno++) 
    {
      if (!relevantClauses[clauseno]) continue;
      if (lazyInference_)
      {
        clauseTrainCnts[clauseno] = clauseTrainCnts[clauseno] + 
                                    totalTrueCnts_[domainIdx][clauseno] -
                                    defaultTrueCnts_[domainIdx][clauseno];
        clauseInferredCnts[clauseno] += inferredCnts[clauseno];
      }
      else
      {
        clauseTrainCnts[clauseno] += trainCnts[clauseno];
        clauseInferredCnts[clauseno] += inferredCnts[clauseno];
      }
      //cout << clauseno << ":\t\t" <<trainCnt<<"\t\t\t\t"<<inferredCnt<<endl;
    }

    if (dldebug)
    {
      cout << "Domain " << domainIdx << " net counts : " << endl;
      cout << "\t\ttrain count\t\t\t\tinferred count" << endl << endl;
    }

    for (int clauseno = 0; clauseno < clauseCnt; clauseno++) 
    {
      if (!relevantClauses[clauseno]) continue;
      
      if (dldebug)
        cout << clauseno << ":\t\t" << clauseTrainCnts[clauseno] << "\t\t\t\t"
             << clauseInferredCnts[clauseno] << endl;
        // NOTE: rescaling has been implemented to affect the gradient, 
        // but not the Hessian.
      if (rescaleGradient_ && clauseTrainCnts[clauseno] > 0)
      {
        gradient[clauseno] -= 
          (clauseTrainCnts[clauseno] - clauseInferredCnts[clauseno])
            / clauseTrainCnts[clauseno];
      }
      else
      {
        gradient[clauseno] -= clauseTrainCnts[clauseno] - 
                            clauseInferredCnts[clauseno];
      }
    }

    delete[] inferredCnts;
    delete[] clauseTrainCnts;
    delete[] clauseInferredCnts;
  }
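
  // The update above is the standard gradient of the negative
  // conditional log likelihood:
  //   -d(log P)/dw_j = E_w[n_j] - n_j(data),
  // i.e., inferred count minus training count, which is what
  //   gradient[j] -= trainCnts[j] - inferredCnts[j]
  // accumulates.  With rescaleGradient_, each component is divided by
  // its training count, a per-clause normalization that affects the
  // gradient only, not the Hessian.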


    // Get the gradient of the negative log likelihood
  void getGradient(double* const & weights, double* const & gradient,
                   const int numWts) 
  {
      // Compute the gradient
    memset(gradient, 0, numWts*sizeof(double));

      // There is one DB or the clauses of multiple DBs line up
    if (idxTrans_ == NULL)
    {
      for (int i = 0; i < domainCnt_; i++) 
      {
        //cout << "For domain number " << i << endl << endl; 
        getGradientForDomain(gradient, i);
      }
    }
    else
    {
        // The clauses for multiple databases do not line up
      Array<Array<double> >* gradsPerDomain = idxTrans_->getGradsPerDomain();
      const Array<Array<Array<IdxDiv>*> >* cIdxToCFIdxsPerDomain 
        = idxTrans_->getClauseIdxToClauseFormulaIdxsPerDomain();
     
      for (int i = 0; i < domainCnt_; i++) 
      {
        //cout << "For domain number " << i << endl << endl; 

        Array<double>& grads = (*gradsPerDomain)[i];
        memset((double*)grads.getItems(), 0, grads.size()*sizeof(double));
        
        getGradientForDomain((double*)grads.getItems(), i);
        
          // map clause gradient to clause/formula gradients
        assert(grads.size() == clauseCntPerDomain_[i]);
        for (int j = 0; j < grads.size(); j++)
        {
          Array<IdxDiv>* idxDivs = (*cIdxToCFIdxsPerDomain)[i][j];
          for (int k = 0; k < idxDivs->size(); k++)
            gradient[ (*idxDivs)[k].idx ] += grads[j] / (*idxDivs)[k].div;
        }
      }
    }

      // Add the derivative of the prior 
    if (usePrior_) 
    {
      for (int i = 0; i < numWts; i++) 
      {
        //if (!relevantClausesFormulas_[i]) continue;
        double sd = priorStdDevs_[i];
        double priorDerivative = (weights[i]-priorMeans_[i])/(sd*sd);
        //cout << i << " : " << "gradient : " << gradient[i]
        //     << "  prior gradient : " << priorDerivative;
        gradient[i] += priorDerivative; 
        //cout << "  net gradient : " << gradient[i] << endl; 
      }
    }
  }
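
  // The prior term above comes from the Gaussian penalty
  //   -log N(w_i; mu_i, sd_i^2) = (w_i - mu_i)^2 / (2 sd_i^2) + const,
  // whose derivative with respect to w_i is (w_i - mu_i) / sd_i^2; this
  // is added to the gradient of the negative log likelihood.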

 public:
    // Different gradient descent methods
    // SIMPLE is Voted Perceptron
  const static int SIMPLE = 0;
  const static int DN = 2;
  const static int CG = 3;


 private:
  int domainCnt_;
  Array<Array<double> > logOddsPerDomain_;
  Array<int> clauseCntPerDomain_;

    // Used in lazy version
  Array<Array<double> > totalTrueCnts_; 
  Array<Array<double> > totalFalseCnts_; 
  Array<Array<double> > defaultTrueCnts_;
  Array<Array<double> > defaultFalseCnts_;

  Array<Array<bool> > relevantClausesPerDomain_;
  Array<bool> relevantClausesFormulas_;

    // Used to compute cnts from mrf
  Array<double*> trainTrueCnts_;
  Array<double*> trainFalseCnts_;

  bool usePrior_;
  const double* priorMeans_, * priorStdDevs_; 

  IndexTranslator* idxTrans_; //not owned by object; don't delete
  
  bool lazyInference_;
  bool isQueryEvidence_;
  bool rescaleGradient_;
  int  method_;
  double cg_lambda_;
  bool preconditionCG_;

  int maxBacktracks_;
  int backtrackCount_;
  double cg_max_lambda_;

  Array<Inference*> inferences_;
  
    // Using EM to fill in missing values?
  bool withEM_;
};
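
// A minimal usage sketch (disabled; illustrative only).  It assumes the
// caller has built one Inference object per training database (e.g.,
// MaxWalkSat-based) and filled in nonEvidPredNames, the prior arrays,
// and numWeights; none of that setup is shown here.
#if 0
  Array<Inference*> inferences;        // one per training database
  StringHashArray nonEvidPredNames;    // the query predicate names
  IndexTranslator* idxTrans = NULL;    // NULL: clause indices line up

  DiscriminativeLearner dl(inferences, nonEvidPredNames, idxTrans,
                           false,                        // lazyInference
                           false,                        // withEM
                           false,                        // rescaleGradient
                           DiscriminativeLearner::CG,    // method
                           100.0,                        // lambda
                           true,                         // preconditionCG
                           DBL_MAX);                     // maxLambda

  int numWeights = 0;                  // set to the no. of clauses/formulas
  double* weights = new double[numWeights];
  double* means   = new double[numWeights];   // prior means
  double* stdDevs = new double[numWeights];   // prior std devs
  dl.setMeansStdDevs(numWeights, means, stdDevs);

  dl.learnWeights(weights, numWeights,
                  100,                 // maxIter
                  -1,                  // maxSec: no time limit
                  0.001,               // learningRate (used by SIMPLE)
                  0.0,                 // momentum
                  true,                // initWithLogOdds
                  1000000,             // mwsMaxSubsequentSteps
                  false);              // periodicMLNs
#endif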


#endif
