O'Reilly Databases

oreilly.comSafari Books Online.Conferences.

We've expanded our coverage and improved our search! Search for all things Database across O'Reilly!

Search Search Tips

advertisement
AddThis Social Bookmark Button

Print Subscribe to Databases Subscribe to Newsletters

Calculating Entropy for Data Miners
Pages: 1, 2, 3, 4, 5

Conditional Entropy Code

The source code for the conditional entropy class is:


<?php
/**
* @package IT
*/

require_once "Output.php";

/**
* Computes the conditional entropy in bits between
* two data columns and retains intermediate values
* that are useful for generating output reports.
*
* @author Paul Meagher, Datavore Productions
* @version 0.5
* @modified 22/02/2005
*/

class ConditionalEntropy extends Output {

  var $n = 0;

  var $columns = array();
    
  var $row_freqs = array();
  var $col_freqs = array();
  
  var $row_probs = array();
  var $col_probs = array();
  
  var $row_labels = array();
  var $col_labels = array();   
  
  var $joint_freqs = array();
  var $joint_probs = array();

  var $cond_probs  = array();    

  var $bits = 0;

  var $data = array();

  var $table  = "";
  var $select = "";     
  var $where  = "";   

  /* Methods for handling database table input */
          
  function setTable($table) {
    $this->table = $table;
  }

  function setSelect($sql) {
    $this->select = $sql;
  }

  function setWhere($sql) {
    $this->where = " WHERE ".$sql;
  }

  function getSQL() {
    if (empty($this->select)) {
      $sql  = " SELECT ".$this->columns[0].",".$this->columns[1];
      $sql .= " FROM $this->table ";
      if (empty($this->where))
        return $sql;
      else
        return $sql . $this->where;        
    } else
      return $this->select;      
  }
     
  function getJointAndMarginalFrequenciesFromTable() {
    global $db;    
    $sql    = $this->getSQL();
    $result = $db->query($sql);    
    if (DB::isError($result))
      die($result->getMessage());
    else {      
      $n = 0;
      while($row = $result->fetchRow()) {        
        $a = $row[$this->columns[0]];
        $b = $row[$this->columns[1]];
        $this->joint_freqs[$a][$b]++;
        $this->row_freqs[$a]++; // aka row marginals
        $this->col_freqs[$b]++; // aka col marginals
        $n++;
      }
      $this->n = $n;
    }  
    return true;
  }

  /* Methods for handling array input */
      
  function setArray($data) {
    $this->data = $data;
  }

  function getJointAndMarginalFrequenciesFromArray() {
    $this->n = count($this->data);        
    for ($i=0; $i < $this->n; $i++) {
      $a = $this->data[$i][0];
      $b = $this->data[$i][1];
      $this->joint_freqs[$a][$b]++;
      $this->row_freqs[$a]++; // aka row marginals
      $this->col_freqs[$b]++; // aka col marginals
    }  
  }

  /* Shared methods */

  function setAntecedent($column, $label="") {
    $this->columns[0] = $column;
    $this->labels[0]  = $label;    
  }

  function setConsequent($column, $label="") {
    $this->columns[1] = $column;
    $this->labels[1] = $label;    
  }

  function setConditional($conditional, $labels="") {
    $cond_pieces          = explode("|", $conditional);          

    $this->columns[0]  = trim($cond_pieces[1]);
    $this->columns[1]  = trim($cond_pieces[0]);    
    if ($labels != "") {
      $label_pieces       = explode("|", $labels);          
      $this->labels[0] = trim($label_pieces[1]);
      $this->labels[1] = trim($label_pieces[0]);          
    } else {
      $this->labels[0] = $this->columns[0]{0};
      $this->labels[1] = $this->columns[1]{0};      
    }
  }

  function clear() {   
    $this->n           = 0;    
    $this->row_freqs   = array();
    $this->col_freqs   = array();
    $this->row_probs   = array();
    $this->col_probs   = array();
    $this->row_labels  = array();
    $this->col_labels  = array();   
    $this->joint_freqs = array();
    $this->joint_probs = array();
    $this->cond_probs  = array();  
    $this->bits        = 0;        
  }

  function analyze() {
    $this->clear();
    if (empty($this->table))
      $this->getJointAndMarginalFrequenciesFromArray();
    else
      $this->getJointAndMarginalFrequenciesFromTable();     
    $this->row_labels = array_keys($this->row_freqs);
    $this->col_labels = array_keys($this->col_freqs);    
    $this->getJointAndMarginalProbabilities();
    $this->getConditionalProbabilities();      
    $this->getConditionalEntropyScore();    
  }       

  function getJointAndMarginalProbabilities() {
    foreach($this->joint_freqs AS $key1=>$array) {
      foreach($array AS $key2=>$val2) {
        $this->joint_probs[$key1][$key2] = $this->joint_freqs[$key1][$key2] / 
		   $this->n;
        $this->row_probs[$key1] += $this->joint_probs[$key1][$key2];
        $this->col_probs[$key2] += $this->joint_probs[$key1][$key2];
      }
    } 
  }
  
  function getConditionalProbabilities() {
    foreach($this->joint_probs AS $key1=>$array)
      foreach($array AS $key2=>$val2)         
        $this->cond_probs[$key1][$key2] = $this->joint_probs[$key1][$key2] /
                                             $this->row_probs[$key1];
  }

  function loadJointProbabilities($joint_probs) {
    $this->clear();
    $this->joint_probs = $joint_probs;
    foreach($joint_probs AS $key1=>$array) {
      foreach($array AS $key2=>$val2) {
        $this->row_probs[$key1] += $joint_probs[$key1][$key2];
        $this->col_probs[$key2] += $joint_probs[$key1][$key2];        
      }
    }
    $this->getConditionalProbabilities();      
    $this->getConditionalEntropy();
  }

  function getConditionalEntropyScore() {
    foreach($this->joint_freqs AS $key1=>$array)
      foreach($array AS $key2=>$val2)
        $this->bits -= $this->joint_probs[$key1][$key2] *
                          log($this->cond_probs[$key1][$key2], 2);
  }
}
?>

Mutual Information

The formula for computing mutual information uses formulas encountered earlier:

I(X;Y) = H(X) - H(X|Y)

The following PHP script computes the mutual information between age and buys_computer:


<?php
/**
* @package IT
*
* Example of how to compute mutual info.
*/
include_once "../config.php";
include_once PHPMATH."/IT/Entropy.php";
include_once PHPMATH."/IT/ConditionalEntropy.php";

$e = new Entropy;
$e->setTable("Customers");
$e->setColumn("buys_computer");
$e->analyze();
echo "H(buys_computer) = ". $e->bits ."<br />";

$ce = new ConditionalEntropy;
$ce->setTable("Customers");
$ce->setConditional("buys_computer|age");
$ce->analyze();
echo "H(buys_computer | age) = ". $ce->bits ."<br />";

$mutual_info = $e->bits - $ce->bits;
echo "I(age;buys_computer) = H(buys_computer) - H(buys_computer | age) = 
   $mutual_info <br />";
?>

The output of this script looks like this:

H(buys_computer) = 0.940285958671
H(buys_computer | age) = 0.693536138896
I(age;buys_computer) = H(buys_computer) - H(buys_computer | age) = 0.246749819774

If the conditional entropy score is very low (that is, close to 0), this will have the effect of producing a relatively high mutual information score. A high mutual information score indicates that you can use one variable to reduce your uncertainty about the other variable. Conversely, a low mutual information score shows that you cannot use one variable to reduce your uncertainty about the values of the other random variable. A signal transferred from the age column is not well preserved in the buys_computer column. In signal theory terms, the "channel" between them is full of distortion.

The term mutual information derives from the fact that you will obtain the same mutual information score no matter which of the two variables you use as the conditioning variable in the equation. Instead of conditioning buys_computer on age, you can condition age on buys_computer and get the same result:

H(age) = 1.57740628285
H(age | buys_computer) = 1.33065646308
I(age;buys_computer) = H(age) - H(age | buys_computer) = 0.246749819774

A few other fundamental relationships are worth pointing out:

H(A, B) = H(A) + H(B) - I(A;B)
H(A, B) = H(A) + H(B|A)
H(A, B) = H(B) + H(A|B)

Pages: 1, 2, 3, 4, 5

Next Pagearrow




Tagged Articles

Be the first to post this article to del.icio.us

Related to this Article

Data Jujitsu: The Art of Turning Data into Product Data Jujitsu: The Art of Turning Data into Product
November 2012
$0.00 USD

Designing Great Data Products Designing Great Data Products
March 2012
$0.00 USD

Sponsored Resources

  • Inside Lightroom
Advertisement
O'reilly

© 2013, O’Reilly Media, Inc.

(707) 827-7019 (800) 889-8969

All trademarks and registered trademarks appearing on oreilly.com are the property of their respective owners.

About O'Reilly

  • Academic Solutions
  • Jobs
  • Contacts
  • Corporate Information
  • Press Room
  • Privacy Policy
  • Terms of Service
  • Writing for O'Reilly

Community

  • Authors
  • Community & Featured Users
  • Forums
  • Membership
  • Newsletters
  • O'Reilly Answers
  • RSS Feeds
  • User Groups

Partner Sites

  • makezine.com
  • makerfaire.com
  • craftzine.com
  • igniteshow.com
  • PayPal Developer Zone
  • O'Reilly Insights on Forbes.com

Shop O'Reilly

  • Customer Service
  • Contact Us
  • Shipping Information
  • Ordering & Payment
  • The O'Reilly Guarantee