structlearn.cpp

00001 #include "structlearn.h"
00002 
00004 
00005 struct ExistFormula
00006 {
00007   ExistFormula(const string& fformula) 
00008     : formula(fformula), gain(0), wt(0), newScore(0), numPreds(0) {}
00009   ~ExistFormula() 
00010   { 
00011     for (int i = 0; i < cnfClausesForDomains.size(); i++)
00012       cnfClausesForDomains[i].deleteItemsAndClear();
00013   }
00014 
00015   string formula;
00016     //cnfClausesForDomains[d] are the CNF clauses formed with the constants 
00017     //in domain d
00018   Array<Array<Clause*> > cnfClausesForDomains;
00019   double gain; 
00020   double wt;
00021   Array<double> wts;
00022   double newScore;
00023   int numPreds;
00024 };
00025 
00026 
00027 void StructLearn::deleteExistFormulas(Array<ExistFormula*>& existFormulas)
00028 { existFormulas.deleteItemsAndClear(); }
00029 
00030 
00031 void StructLearn::getMLNClauses(Array<Clause*>& initialMLNClauses,
00032                                 Array<ExistFormula*>& existFormulas)
00033 {
00034   for (int i = 0; i < mln0_->getNumClauses(); i++)
00035     if (isModifiableClause(i)) 
00036     {
00037       Clause* c = (Clause*) mln0_->getClause(i);
00038       initialMLNClauses.append(new Clause(*c));
00039     }
00040 
00041   const FormulaAndClausesArray* fnca = mln0_->getFormulaAndClausesArray();
00042   for (int i = 0; i < fnca->size(); i++)
00043     if (isNonModifiableFormula((*fnca)[i]))
00044       existFormulas.append(new ExistFormula((*fnca)[i]->formula));
00045 
00046   for (int i = 0; i < existFormulas.size(); i++)
00047   {
00048     string formula = existFormulas[i]->formula;      
00049     FormulaAndClauses tmp(formula, 0, false);
00050     const FormulaAndClausesArray* fnca
00051       = (*mlns_)[0]->getFormulaAndClausesArray();
00052     int a = fnca->find(&tmp);            
00053     existFormulas[i]->numPreds = (*fnca)[a]->numPreds;
00054 
00055     Array<Array<Clause*> >& cnfClausesForDomains 
00056       = existFormulas[i]->cnfClausesForDomains;
00057     cnfClausesForDomains.growToSize(mlns_->size());
00058     for (int j = 0; j < mlns_->size(); j++)
00059     {
00060       Array<Clause*>& cnfClauses = cnfClausesForDomains[j];
00061       fnca = (*mlns_)[j]->getFormulaAndClausesArray();
00062       a = fnca->find(&tmp);
00063       assert(a >= 0);
00064       IndexClauseHashArray* indexClauses = (*fnca)[a]->indexClauses;
00065       for (int k = 0; k < indexClauses->size(); k++)
00066       {
00067         Clause* c = new Clause(*((*indexClauses)[k]->clause));
00068         c->newAuxClauseData();
00069         cnfClauses.append(c);
00070       }
00071       cnfClauses.compress();        
00072     }
00073   }
00074 }
00075 
00076 
00077   //cnfClauses enter & exit function with its AuxClauseData's cache == NULL
00078 inline
00079 void StructLearn::pllCountsForExistFormula(Clause* cnfClause, 
00080                                            const int& domainIdx,
00081                                            int* clauseIdxInMln,
00082                                            Array<UndoInfo*>* const & undoInfos)
00083 {
00084   assert(cnfClause->getAuxClauseData()->cache == NULL);
00085   bool inCache = false; 
00086   bool hasDomainCounts = false; 
00087   double prevCNFClauseSize = 0;
00088 
00089   if (cacheClauses_)
00090   {
00091     int i; 
00092     if ((i = cachedClauses_->find(cnfClause)) >= 0) //if clause is in cache
00093     {
00094       inCache = true;
00095       Array<Array<Array<CacheCount*>*>*>* cache =
00096         (*cachedClauses_)[i]->getAuxClauseData()->cache;
00097       Array<Array<CacheCount*>*>*  domainCache = (*cache)[domainIdx];
00098       for (int j = 0; j < domainCache->size(); j++)
00099         if ((*domainCache)[j] != NULL) { hasDomainCounts = true; break; }
00100     }
00101      
00102     if (hasDomainCounts) // if clause and counts for domain are cached
00103     {
00104       pll_->insertCounts(clauseIdxInMln, undoInfos,
00105                          (*cachedClauses_)[i]->getAuxClauseData()->cache,
00106                          domainIdx);
00107       return;
00108     }
00109     else
00110     {
00111       if (cacheSizeMB_ <  maxCacheSizeMB_)
00112       {
00113         if (inCache) //if clause is in cache but not counts for domain
00114         {
00115           cnfClause->getAuxClauseData()->cache 
00116             = (*cachedClauses_)[i]->getAuxClauseData()->cache;
00117           prevCNFClauseSize = cnfClause->sizeMB();
00118         }
00119         else
00120           cnfClause->newCache(domains_->size(),
00121                               (*domains_)[0]->getNumPredicates());
00122       }
00123       else
00124       {
00125         static bool printCacheFull = true;
00126         if (printCacheFull)
00127         {
00128           cout << "Cache is full, approximate size = " << cacheSizeMB_ 
00129                <<" MB" << endl;
00130           printCacheFull = false;
00131         }
00132       }
00133     }
00134   }
00135     
00136   //we do not have the counts
00137 
00138   pll_->computeCountsForNewAppendedClause(cnfClause,clauseIdxInMln,domainIdx,
00139                                           undoInfos, sampleClauses_, 
00140                                           cnfClause->getAuxClauseData()->cache);
00141 
00142   if (cnfClause->getAuxClauseData()->cache)
00143   {
00144     if (inCache)
00145     {
00146       cacheSizeMB_ += cnfClause->sizeMB() - prevCNFClauseSize;
00147       cnfClause->getAuxClauseData()->cache = NULL; 
00148     }
00149     else
00150     {
00151       if (cacheSizeMB_ < maxCacheSizeMB_)
00152       {
00153         cacheSizeMB_ += cnfClause->sizeMB();
00154         Array<Array<Array<CacheCount*>*>*>* cache 
00155           = cnfClause->getAuxClauseData()->cache;
00156         cnfClause->getAuxClauseData()->cache = NULL;
00157         Clause* copyClause = new Clause(*cnfClause);
00158         copyClause->getAuxClauseData()->cache = cache;
00159         copyClause->compress();
00160         cachedClauses_->append(copyClause);
00161       }
00162       else
00163       {
00164         cnfClause->getAuxClauseData()->deleteCache();
00165         cnfClause->getAuxClauseData()->cache = NULL; 
00166       }
00167     }
00168   }
00169 }
00170 
00171 
00172 inline
00173 void StructLearn::printNewScore(const string existFormStr,const int& lbfgsbIter,
00174                                 const double& lbfgsbSec, const double& newScore,
00175                                 const double& gain, const double& penalty, 
00176                                 const double& wt)
00177 {
00178   cout << "*************************** " << candCnt_++ << ", iter " << iter_ 
00179        << ", beam search iter " << bsiter_ << endl;
00180 
00181   cout << "exist formula : " << existFormStr << endl;
00182   cout << "op            : OP_ADD" << endl; 
00183   cout << "score    : " << newScore << endl;
00184   cout << "gain     : " << gain << endl;
00185   cout << "penalty  : " << penalty << endl;
00186   cout << "net gain : " << gain - penalty << endl;
00187   cout << "wt       : " << wt << endl;
00188   cout << "num LBFGSB iter      = " << lbfgsbIter << endl;
00189   cout << "time taken by LBFGSB = "; Timer::printTime(cout, lbfgsbSec);
00190   cout << endl;
00191   cout << "*************************** " << endl << endl;;
00192 }
00193 
00194 
00195 inline
00196 void StructLearn::rquicksort(Array<ExistFormula*>& existFormulas, const int& l, 
00197                              const int& r)  
00198 {
00199   ExistFormula** items = (ExistFormula**) existFormulas.getItems();
00200   if (l >= r) return;
00201   ExistFormula* tmp = items[l];
00202   items[l] = items[(l+r)/2];
00203   items[(l+r)/2] = tmp;
00204 
00205   int last = l;
00206   for (int i = l+1; i <= r; i++)
00207     if (items[i]->gain > items[l]->gain)
00208     {
00209       ++last;
00210       ExistFormula* tmp = items[last];
00211       items[last] = items[i];
00212       items[i] = tmp;
00213     }
00214     
00215   tmp = items[l];
00216   items[l] = items[last];
00217   items[last] = tmp;
00218   rquicksort(existFormulas, l, last-1);
00219   rquicksort(existFormulas, last+1, r); 
00220 }
00221 
00222 
00223 inline
00224 void StructLearn::rquicksort(Array<ExistFormula*>& ef) 
00225 { rquicksort(ef, 0, ef.size()-1); }
00226 
00227 
00228   // Compute counts for formula, and learn optimal weights and gain.
00229   // If we are computing counts only, then the counts are added permanently into
00230   // PseudoLogLikelihood.
00231 inline
00232 void StructLearn::evaluateExistFormula(ExistFormula* const& ef, 
00233                                        const bool& computeCountsOnly,
00234               Array<Array<Array<IndexAndCount*> > >* const & iacArraysPerDomain,
00235                                        const double& prevScore)
00236 {
00237   bool undo = !computeCountsOnly;
00238   bool evalGainLearnWts = !computeCountsOnly;
00239 
00240   Array<int*> idxPtrs; //for clean up
00241   Array<UndoInfo*>* undoInfos = (undo) ? new Array<UndoInfo*> : NULL;
00242 
00243   //compute counts for the CNF clauses
00244 
00245   Array<Array<Clause*> >& cnfClausesForDomains = ef->cnfClausesForDomains;
00246   for (int d = 0; d < cnfClausesForDomains.size(); d++)
00247   {
00248     Array<Clause*>& cnfClauses = cnfClausesForDomains[d];    
00249     MLN* mln = (*mlns_)[d];
00250     for (int k = 0; k < cnfClauses.size(); k++)
00251     {
00252       Clause* c = cnfClauses[k];
00253         //if we adding counts permanently, then don't add counts that were added
00254         //previously; otherwise we can add the duplicate counts because they 
00255         //will be undone later.
00256       if (!undo && mln->containsClause(c)) continue;
00257       int* tmpClauseIdxInMLN = new int(mln->getNumClauses() + k);
00258       idxPtrs.append(tmpClauseIdxInMLN);
00259       if (iacArraysPerDomain)
00260       {
00261         Array<Array<IndexAndCount*> >& iacArrays = (*iacArraysPerDomain)[d];
00262         assert(iacArrays.size() == cnfClauses.size());
00263         Array<IndexAndCount*>& iacArray = iacArrays[k];
00264 
00265         Array<UndoInfo*> tmpUndoInfos;
00266         pllCountsForExistFormula(c, d, tmpClauseIdxInMLN, &tmpUndoInfos);
00267         for (int i = 0; i < tmpUndoInfos.size(); i++)
00268           iacArray.append(tmpUndoInfos[i]->affectedArr->lastItem());
00269         if (undoInfos) undoInfos->append(tmpUndoInfos);
00270         else           tmpUndoInfos.deleteItemsAndClear();
00271       }
00272       else
00273         pllCountsForExistFormula(c, d, tmpClauseIdxInMLN, undoInfos);
00274     }
00275   }
00276 
00277 
00278   //find optimal weights and gain of formula 
00279 
00280   if (evalGainLearnWts)
00281   {
00282     Array<double> priorMeans, priorStdDevs;    
00283         //either add one existentially quantified formula (when clauses do not
00284         //line up perfectly across databases) or add all CNF clauses (when
00285         //they do)
00286     int numAdded = (indexTrans_) ? 1 : cnfClausesForDomains[0].size();
00287     setPriorMeansStdDevs(priorMeans, priorStdDevs, numAdded, NULL);
00288 
00289     if (hasPrior_) 
00290       pll_->setMeansStdDevs(priorMeans.size(), priorMeans.getItems(), 
00291                             priorStdDevs.getItems());
00292     else           
00293       pll_->setMeansStdDevs(-1, NULL, NULL);
00294 
00295     if (indexTrans_)
00296     {
00297       for (int d = 0; d < cnfClausesForDomains.size(); d++)
00298       {
00299         int numCNFClauses = cnfClausesForDomains[d].size();
00300         indexTrans_->appendClauseIdxToClauseFormulaIdxs(1, numCNFClauses);
00301       }
00302     }
00303       
00304     int iter; bool error; double elapsedSec; 
00305     int numClausesFormulas = getNumClausesFormulas();
00306     Array<double>* wts = &(ef->wts);
00307     wts->growToSize(numClausesFormulas + numAdded + 1);
00308 
00309     double newScore = maximizeScore(numClausesFormulas, numAdded, wts, 
00310                                     NULL, NULL, iter, error, elapsedSec);
00311 
00312     if (indexTrans_)
00313     {
00314       for (int d = 0; d < cnfClausesForDomains.size(); d++)
00315       {
00316         int numCNFClauses = cnfClausesForDomains[d].size();
00317         indexTrans_->removeClauseIdxToClauseFormulaIdxs(1, numCNFClauses);
00318       }
00319     }
00320 
00321     double totalWt = 0;
00322     for (int i = 0; i < numAdded; i++)
00323       totalWt += (*wts)[numClausesFormulas+i+1];
00324 
00325     double penalty = ef->numPreds * penalty_;
00326       
00327     ef->gain = newScore-prevScore-penalty;
00328     ef->wt = totalWt;
00329     ef->newScore = newScore;
00330 
00331     if (error) {newScore=prevScore;cout<<"LBFGSB failed to find wts"<<endl;}
00332     printNewScore(ef->formula, iter, elapsedSec, newScore, 
00333                   newScore-prevScore, penalty, totalWt);
00334   }
00335     
00336   if (undo) { pll_->undoAppendRemoveCounts(undoInfos); delete undoInfos; }
00337   idxPtrs.deleteItemsAndClear();
00338 }//evaluateExistFormula()
00339 
00340 
00341 
00342 double StructLearn::evaluateExistFormulas(Array<ExistFormula*>& existFormulas, 
00343                                        Array<ExistFormula*>& highGainWtFormulas,
00344                                           const double& prevScore)
00345 {
00346   if (existFormulas.size() == 0) return 0;
00347   for (int i = 0; i < existFormulas.size(); i++)
00348     evaluateExistFormula(existFormulas[i], false, NULL, prevScore);
00349     
00350   Array<ExistFormula*> tmp(existFormulas);
00351   rquicksort(tmp);
00352   double minGain = DBL_MAX;
00353   cout << "evaluated existential formulas " << endl;
00354   cout << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << endl;  
00355   for (int i = 0; i < tmp.size(); i++)
00356   {
00357     existFormulas[i] = tmp[i];
00358     double gain = existFormulas[i]->gain;
00359     double wt = existFormulas[i]->wt;
00360     cout << i << "\t" << existFormulas[i]->formula << endl
00361          << "\tgain = " << gain << ",  wt = " << wt << ",  op = OP_ADD" 
00362          << endl;
00363     if (gain > minGain_ && wt >= minWt_) 
00364     {
00365       highGainWtFormulas.append(tmp[i]);
00366       if (gain < minGain)  minGain = gain;
00367     }
00368   }
00369   cout << "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" << endl << endl;
00370 
00371   if (minGain == DBL_MAX) minGain = 0;
00372   return minGain;
00373 } 
00374 
00375 
00376 inline
00377 void StructLearn::appendExistFormulaToMLNs(ExistFormula* const & ef)
00378 {
00379   Array<Array<Array<IndexAndCount*> > > iacsPerDomain;
00380   Array<Array<Clause*> >& cnfClausesForDomains = ef->cnfClausesForDomains;
00381   iacsPerDomain.growToSize(cnfClausesForDomains.size());
00382   for (int d = 0; d < cnfClausesForDomains.size(); d++)
00383   {
00384     Array<Array<IndexAndCount*> >& iacArrays = iacsPerDomain[d];
00385     iacArrays.growToSize( cnfClausesForDomains[d].size() );
00386   }
00387   evaluateExistFormula(ef, true, &iacsPerDomain, 0);
00388 
00389   Array<double>& wts = ef->wts;
00390   int numClausesFormulas = getNumClausesFormulas();
00391 
00392     //update weights before CNF clauses are added as duplicate clause wts are
00393     //accumulated as they are added.
00394   if (indexTrans_ == NULL) updateWts(wts, NULL, NULL);
00395 
00396     //append CNF clauses to MLN
00397   for (int d = 0; d < cnfClausesForDomains.size(); d++)
00398   {
00399     MLN* mln = (*mlns_)[d];
00400     Array<Array<IndexAndCount*> >& iacArrays = iacsPerDomain[d];
00401     Array<Clause*>& cnfClauses = cnfClausesForDomains[d];
00402 
00403     for (int i = 0; i < cnfClauses.size(); i++)
00404     {
00405       int idx;
00406         //when we are learning the weight of a formula (i.e. indexsTrans !=NULL)
00407         //its CNF clause weight don't matter, so set them to 0.
00408       double wt = (indexTrans_) ?  0 : wts[numClausesFormulas+i+1];
00409         //if cnfClauses[i] is already in MLN, its weights will be correctly set
00410         //in updateWts() later
00411       mln->appendClause(ef->formula, true, new Clause(*cnfClauses[i]),
00412                         wt, false, idx);
00413       mln->setFormulaPriorMean(ef->formula, priorMean_);
00414       ((MLNClauseInfo*)mln->getMLNClauseInfo(idx))->priorMean 
00415         += priorMean_/cnfClauses.size();
00416 
00417       int* idxPtr = mln->getMLNClauseInfoIndexPtr(idx);
00418       Array<IndexAndCount*>& iacs = iacArrays[i]; 
00419       for (int j = 0; j < iacs.size(); j++)  iacs[j]->index = idxPtr;
00420     }
00421   }
00422   assert(pll_->checkNoRepeatedIndex());
00423 
00424 
00425   if (indexTrans_)
00426   {
00427        //update weights after the formula has been added to the MLN
00428     Array<string> appendedFormula; 
00429     appendedFormula.append(ef->formula);
00430     updateWts(wts, NULL, &appendedFormula);
00431       //MLNs has changed, the index translation must be recreated
00432     indexTrans_->createClauseIdxToClauseFormulaIdxsMap();
00433   }
00434 
00435 
00436   cout << "Modified MLN: Appended formula to MLN: " << ef->formula << endl;
00437 }
00438 
00439 
00440 inline
00441 bool StructLearn::effectExistFormulaOnMLNs(ExistFormula* ef, 
00442                                            Array<ExistFormula*>& existFormulas, 
00443                                            double& score)
00444 {
00445   cout << "effecting existentially quantified formula " << ef->formula 
00446        << " on MLN..." << endl;
00447   appendExistFormulaToMLNs(ef);
00448   score = ef->newScore;
00449   printMLNClausesWithWeightsAndScore(score, iter_); 
00450   printMLNToFile(NULL, iter_);
00451 
00452   int r = existFormulas.find(ef);
00453   assert(r >= 0);
00454   ExistFormula* rf = existFormulas.removeItemFastDisorder(r);
00455   assert(rf == ef);
00456   delete rf;
00457   return true;
00458 }
00459 
00460 
00461 bool StructLearn::effectBestCandidateOnMLNs(Array<Clause*>& bestCandidates, 
00462                                             Array<ExistFormula*>& existFormulas,
00463                                        Array<ExistFormula*>& highGainWtFormulas,
00464                                             double& score)
00465 {
00466   cout << "effecting best candidate among existential formulas and "
00467        << "best candidates on MLN..." << endl << endl;
00468 
00469   int a = 0, b = 0;
00470   bool ok = false;
00471   int numCands = bestCandidates.size() + highGainWtFormulas.size();
00472   for (int i = 0; i < numCands; i++)
00473   {
00474     if (a >= bestCandidates.size())
00475     {
00476       if (ok=effectExistFormulaOnMLNs(highGainWtFormulas[b++], 
00477                                       existFormulas, score)) break;
00478     }
00479     else
00480     if (b >= highGainWtFormulas.size())
00481     {
00482       cout << "effecting best candidate " << a << " on MLN..." << endl;
00483       if (ok=effectBestCandidateOnMLNs(bestCandidates[a++], score)) break;
00484       cout << "failed to effect candidate on MLN." << endl;
00485       delete bestCandidates[a-1];
00486     }
00487     else
00488     if (highGainWtFormulas[b]->gain > 
00489         bestCandidates[a]->getAuxClauseData()->gain)
00490     {
00491       if (ok=effectExistFormulaOnMLNs(highGainWtFormulas[b++], 
00492                                       existFormulas, score)) break;
00493     }
00494     else
00495     {
00496       cout << "effecting best candidate " << a << " on MLN..." << endl;
00497       if (ok=effectBestCandidateOnMLNs(bestCandidates[a++], score)) break;
00498       cout << "failed to effect candidate on MLN." << endl;
00499       delete bestCandidates[a-1];
00500     }
00501   }
00502   
00503   for (int i = a; i < bestCandidates.size(); i++) delete bestCandidates[i];
00504   return ok;
00505 }

Generated on Tue Jan 16 05:30:03 2007 for Alchemy by  doxygen 1.5.1