?? dci.cc
字號:
// Copyright (C) 2003 salvatore orlando <salvatore.orlando@unive.it>// // This program is free software; you can redistribute it and/or modify// it under the terms of the GNU General Public License as published by// the Free Software Foundation; either version 2 of the License, or// (at your option) any later version.// // This program is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the// GNU General Public License for more details.// // You should have received a copy of the GNU General Public License// along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.#include <iostream>#include <vector>#include <cmath>using namespace std;#include <stdio.h>#include "items.hh"#include "transaction.hh"#include "database.hh"#include "candidates.hh"#include "frequents.hh"#include "utils.hh"#include "direct_count.hh"#include "vertical.hh"#define TEMPORARY_DATASET "dataset.tmp"#define PRUNE_FACTOR_PROJ 0.04#define PRUNE_FACTOR_NEWLIST 0.8#define PRUNE_FACTOR_COMPLEXITY 2char OUTF[128] = ""; // output filebool write_output; // dump frequent itemsets to file ? int first_scan(Data *d, dci_items& counters, unsigned int& max_trans_len);template <class T, class T1>void following_scans(int max_trans_len, dci_items& counters);template <class T, class T1>set_of_frequents<T,T1> *second_scan(int max_trans_len, dci_items& counters, cand_2_iter<T1>& c, DCI_vertical_dataset<T>& DCI_dataset);template <class T, class T1>void DCP_iter(int iter, int max_trans_len, dci_items& counters, DCP_candidates<T,T1>& c, set_of_frequents<T,T1>& next_freq, DCI_vertical_dataset<T>& DCI_dataset);template <class T, class T1>void DCI_iter_diffuse(int iter, dci_items& counters, set_of_frequents<T,T1>& freq, set_of_frequents<T,T1>& next_freq, DCI_vertical_dataset<T>& DCI_dataset);template <class T, class T1>void DCI_iter_diffuse_key(int iter, dci_items& counters, set_of_frequents<T,T1>& freq, set_of_frequents<T,T1>& next_freq, DCI_vertical_dataset<T>& DCI_dataset);template <class T, class T1>void DCI_iter_compact(int iter, dci_items& counters, set_of_frequents<T,T1>& freq, set_of_frequents<T,T1>& next_freq, DCI_vertical_dataset<T>& DCI_dataset);template <class T, class T1>void DCI_iter_compact_key(int iter, dci_items& counters, set_of_frequents<T,T1>& freq, set_of_frequents<T,T1>& next_freq, DCI_vertical_dataset<T>& DCI_dataset);int main(int argc, char ** argv){ Chronos all_time; all_time.StartChronos(); if(argc < 3 || argc > 4) { cerr << "Usage: " << argv[0] << " input_file min_count [output_file]" << endl; return 1; } unsigned int min_count; min_count = atoi(argv[2]); if (min_count <= 0) { cerr << "Usage: " << argv[0] << " input_file min_count" << endl; cerr << " Value of min_count not allowed (min_count > 0)\n"; return 2; } dci_items counters(min_count); // counters for singleton Data *d; // database object used only for the d = new Data(argv[1]); // first scan if (!d->isOpen()) { cerr << "input file " << argv[1] << " cannot be opened!" << endl; exit(1); } if (argc == 4) { sprintf(OUTF, "%s", argv[3]); write_output = true; } else write_output = false; // ************************************************** // First iteration // ************************************************** unsigned int max_support; // the same as nr_of_trans unsigned int max_trans_len; max_support = first_scan(d, counters, max_trans_len); // m1 is the number of items (singleton) // that are found to be frequent int m1 = counters.get_m1(); delete d; if (max_support == 0 || m1 < 2) { // there is nothing to mine... cout << "Total time: " << all_time.ReadChronos() << endl; return 0; } // ************************************************** // Second and Following iterations // since we now know how many distinct items are frequent // and their maximum support, we can optimize the amount // of memory used to store itemsets and their counters // ************************************************** m1++; // we need one extra element to store the key-pattern flag (-1) if (m1 < 256 && max_support < 256) following_scans<unsigned char, unsigned char>(max_trans_len, counters); else if (m1 < 256 && max_support < 256*256) following_scans<unsigned char, unsigned short int>(max_trans_len, counters); else if (m1 < 256 && max_support >= 256*256) following_scans<unsigned char, unsigned int>(max_trans_len, counters); else if (m1 < 256*256 && max_support < 256) following_scans<unsigned short int, unsigned char>(max_trans_len, counters); else if (m1 >= 256*256 && max_support < 256) following_scans<unsigned int, unsigned char>(max_trans_len, counters); else if (m1 < 256*256 && max_support < 256*256) following_scans<unsigned short int, unsigned short int>(max_trans_len, counters); else if (m1 < 256*256 && max_support >= 256*256) following_scans<unsigned short int, unsigned int>(max_trans_len, counters); else if (m1 >= 256*256 && max_support < 256*256) following_scans<unsigned int, unsigned short int>(max_trans_len, counters); else following_scans<unsigned int, unsigned int>(max_trans_len, counters);#ifdef VERBOSE cout << "Total time: " << all_time.ReadChronos() << endl;#endif unlink(TEMPORARY_DATASET); return 0;}// First iteration: counting frequent items int first_scan(Data *d, dci_items& counters, unsigned int& max_trans_len){ Chronos time; time.StartChronos(); float min_supp; bool create=true; // create a temporary binary representation of the dataset // on disk. we will keep writing on the same file during // all subsequent scans binFSout<unsigned int> oo(TEMPORARY_DATASET, create); if(!oo.isOpen()) { cerr << TEMPORARY_DATASET << " could not be opened!" << endl; exit(1); } int totalsize=0; max_trans_len = 0; vector<unsigned int> t; // counting loop while(d->getNextTransaction(t) != 0) { // read next transaction t from db for(unsigned int i=0; i<t.size(); i++) // for each item in t ... counters.insert_item(t[i]); // ... increase corresp. counter if (t.size() > max_trans_len) // keep track of max trans length max_trans_len = t.size(); oo.writeTransaction(t); // write t to temp file totalsize += t.size(); // temp file size } min_supp = 100.0 * (double) counters.min_count / (double) oo.get_num_of_trans(); counters.set_num_of_trans(oo.get_num_of_trans()); // remap item identifiers to 1:m1 int first_freq = counters.remap_items(write_output); if (write_output) { // dump frequents to file FSout o(OUTF, 1); // Iter=1 if(!o.isOpen()) { cerr << OUTF << " could not be opened for writing!" << endl; exit(1); } char count_print[32]; // write empty itemset int num_written = sprintf(count_print, "(%d)\n", oo.get_num_of_trans()); o.printSet(count_print, num_written); // output frequent items for (unsigned int i=first_freq; i < counters.get_m(); i++) { o.printSet(counters.unmap_ascii[i-first_freq], counters.unmap_ascii_len[i-first_freq]); num_written = sprintf(count_print, "(%d)\n", (int) counters.get_count(i)); o.printSet(count_print, num_written); // print for each suffix[i] } } counters.delete_counts();#ifdef VERBOSE cout << "# Database statistics:\n#\t" << oo.get_num_of_trans() <<" transactions\n#\t" << counters.get_m() << " different items\n#\t" << "Max transaction length = " << max_trans_len <<"\n#\n# "; cout << "Min support = " << min_supp << "% " << "Min count = " << counters.min_count << endl << endl; print_statistics("DCP", 1, counters.get_m(), counters.get_m1(), time.ReadChronos());#else printf("1\n%d\n",counters.get_m1()); // 1 is the empty set#endif return(counters.get_max_supp());}// second and following iterations: // the second with direct count, the others with DCP or DCItemplate <class T, class T1>void following_scans(int max_trans_len, dci_items& counters){ set_of_frequents<T, T1> *freq; // frequent itemsets int k=2; // freq. itemset size DCI_vertical_dataset<T> DCI_dataset; // vertical dataset bool DCI = false; // DCI or DCP? unsigned int max_count; // Check if the in-core vertical dataset VD can be created if (DCI_dataset.VD_can_be_allocated(counters.get_m1(), counters.get_num_of_trans())) { DCI_dataset.init_VD(); DCI = true; } max_count = counters.get_max_supp(); // ------------------------------------ // Second iteration using a prefix table // ------------------------------------ cand_2_iter<T1> *c; // candidates for 2nd iteration c = new cand_2_iter<T1>(counters.get_m1()); freq = second_scan<T, T1>(max_trans_len, counters, *c, DCI_dataset); if (freq == NULL) return; if (freq->get_num_frequents() < 3) { // there is nothing else to mine ... delete freq; return; } // ------------------------------------ // third and following iterations // ------------------------------------ DCP_candidates<T,T1> *cand; cand = new DCP_candidates<T,T1>(k); // allocate cand the first time // Iterate with DCP until DCI can start while (!DCI) { k++; // increment iter counter // Check if the in-core vertical dataset VD can be created if (DCI_dataset.VD_can_be_allocated(counters.get_mk(), counters.get_num_of_trans())) { // next iter with DCI DCI_dataset.init_VD(); DCI = true; } gen_candidates<T, T1>(*freq, *cand, counters, k, *c); if (cand->get_num_candidates() == 0) { cout << "no more candidates !\n"; return; } if (k==3) delete c; DCP_iter<T, T1>(k, max_trans_len, counters, *cand, *freq, DCI_dataset); if (freq->get_num_frequents() < k+1) { // nothing to mine ... delete cand; delete freq; return; } } delete cand; // DCI doesn't use candidates // --------------------------------------------- // Iterations with DCI, i.e. using vertical dataset // and intersections. // --------------------------------------------- // check the vertical dataset to choose between // sparse or dense optimizations // ALSO reorder the columns of the vertical dataset!! DCI_dataset.chk_compact_vertical(counters.get_mk()); // initialize key-pattern flags freq->init_keys(); // create a set of frequent itemsets set_of_frequents<T, T1> *next_freq = new set_of_frequents<T, T1>(k); set_of_frequents<T, T1> *tmp_freq; // pointer used for swapping // decide if it is convenient to use the // key-pattern optimization or not bool use_keys = counters.use_key_patterns(); while(1) { k++; // increment iter counter if (use_keys) { // few key patterns are expected // it is faster to try to infer itemsets // supports from non-key patterns DCI_iter_compact_key<T, T1>(k, counters, *freq, *next_freq, DCI_dataset); } else { // too many key patterns are expected // it is faster to *count* itemset supports // but we can still check if the dataset // is compact or not. if (DCI_dataset.is_compact()) DCI_iter_compact<T, T1>(k, counters, *freq, *next_freq, DCI_dataset); else DCI_iter_diffuse<T, T1>(k, counters, *freq, *next_freq, DCI_dataset); } if (next_freq->get_num_frequents() < k) { // nothing to mine delete freq; delete next_freq; return; } else { // swap sets of frequent itemsets tmp_freq = freq; freq = next_freq; next_freq = tmp_freq; } } }template <class T, class T1>set_of_frequents<T,T1> *second_scan(int max_trans_len, dci_items& counters, cand_2_iter<T1>& c, DCI_vertical_dataset<T>& DCI_dataset) // Second iteration with direct count of 2-itemsets. // If possible (enough memory) builds VD on the fly // for third and subsequent iterations{ Chronos time; time.StartChronos(); binFSin<unsigned int> ii(TEMPORARY_DATASET); if(!ii.isOpen()) { cerr << TEMPORARY_DATASET << " could not be opened!" << endl; exit(1); } bool create=false; binFSout<T> oo(TEMPORARY_DATASET, create); if(!oo.isOpen()) { cerr << TEMPORARY_DATASET << " could not be opened!" << endl; exit(1); } int m1 = counters.get_m1(); Transaction<unsigned int> t_in(max_trans_len); Transaction<T> t_out(max_trans_len); // 2nd database scan // direct count of 2-itemsets unsigned int n_tr = 0; while(ii.getNextTransaction(t_in)) { prune_and_map_and_ord_first_iter(t_in, t_out, counters); if (t_out.length >= 2) { int x0; int index_init; for (int t0=0; t0 < (int) t_out.length-1; t0++) { x0 = (int) t_out.t[t0]; index_init = direct_position2_init(x0, m1); for (int t1=t0+1; t1 < (int) t_out.length; t1++) c.incr_cand_count(index_init + (int) t_out.t[t1]); } if (DCI_dataset.VD_is_allocated()) { // write the trans in VD on the fly DCI_dataset.write_t_in_VD(n_tr, t_out); n_tr++; } else // write Dk oo.writeTransaction(t_out); } } // output frequent 2-itemsets and set global pruning mask
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -