-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathKmerTdf_Idf.php
More file actions
126 lines (118 loc) · 3.25 KB
/
KmerTdf_Idf.php
File metadata and controls
126 lines (118 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
<?php
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
// Surface every notice and warning while the index is being built.
error_reporting(E_ALL);

// Kmer extends kmer_similarity, so the parent definition has to be loaded
// before the class declaration below. require_once halts immediately with a
// clear error if the file is missing (include would only warn and the script
// would then die on the class declaration) and it does not re-declare the
// parent if the file has already been loaded elsewhere.
require_once 'kmer_similarity.php';

// init() reads the whole input file into memory and keeps per-k-mer arrays,
// so the memory ceiling is raised for this script.
ini_set('memory_limit', '1000M');

// Example usage:
//   $k = new Kmer();
//   $k->init('/Users/jrobertson/Desktop/animals.txt');
/**
 * Builds a k-mer index with inverse-document-frequency (IDF) weights over the
 * lines of a text file. Each line of the file is treated as one entity
 * ("document").
 *
 * Relies on the parent class for calc_position() and $num_combinations
 * (neither is defined in this file; presumably calc_position() maps a k-mer
 * string to an integer slot in the range 0..num_combinations-1 — confirm
 * against kmer_similarity).
 */
class Kmer extends kmer_similarity
{
    /**
     * k-mer position => comma-separated list of line indices containing that
     * k-mer. The string always starts with a comma and contains one entry per
     * occurrence (so a line index may repeat), e.g. ",0,3,3".
     */
    public $entity_assoc = array();

    /** Not read or written anywhere in this class. */
    public $num_entity = 0;

    /** k-mer position => IDF weight (0 for k-mers that were never seen). */
    public $kmer_weights = array();

    /** k-mer position => total number of occurrences across all lines. */
    public $kmer_freq = array();

    /** Line index => trimmed line text in its original case. */
    public $entity_list = array();

    /** Not read or written anywhere in this class. */
    public $kmer;

    /** Length, in bytes, of the k-mers that get indexed. */
    public $kSize = 2;

    /**
     * Reads $filename, indexes every k-mer of every line and then computes the
     * IDF weight of each k-mer that was seen.
     *
     * If the file cannot be read, file_get_contents() raises its usual warning
     * and the index is left empty instead of registering a phantom blank
     * entity.
     *
     * @param string $filename Path of a text file with one entity per line.
     */
    public function init($filename)
    {
        $this->kmer_weights = array_fill(0, $this->num_combinations, 0);
        $this->kmer_freq = array_fill(0, $this->num_combinations, 0);

        $raw = file_get_contents($filename);
        if ($raw === false) {
            return;
        }

        $size = $this->getkSize();
        foreach (explode("\n", $raw) as $i => $line) {
            // trim() also strips the "\r" left behind by CRLF line endings.
            $line = trim($line);
            $this->addEntity($i, $line);

            foreach ($this->processLine(strtolower($line)) as $k) {
                // processLine() also returns the shorter fragments at the end
                // of the line; only full-length k-mers are indexed.
                if (strlen($k) != $size) {
                    continue;
                }
                $pos = $this->calc_position($k);
                $this->setFreq($pos, $this->getFreq($pos) + 1);
                $this->setAssoc($pos, $this->getAssoc($pos) . ',' . $i);
            }
        }

        $this->calc_word_weights();
    }

    /**
     * Calculates the inverse document frequency of every indexed k-mer, so
     * that a k-mer's weight decreases as the number of entities containing it
     * increases: weight = log10(total entities / entities containing k-mer).
     */
    public function calc_word_weights()
    {
        $numDocs = count($this->entity_list);
        foreach ($this->entity_assoc as $pos => $insts) {
            // The stored list has a leading comma and repeats a line index for
            // every occurrence, so drop empty pieces and duplicates to get the
            // number of distinct entities. 'strlen' is used as the filter
            // because the default filter would also discard line index "0".
            $ids = array_unique(array_filter(explode(',', $insts), 'strlen'));
            $numInstWithWord = count($ids);
            if ($numInstWithWord === 0) {
                continue;
            }

            // The keys of entity_assoc are already positions (see setAssoc),
            // so they are used directly rather than being passed through
            // calc_position() a second time.
            $this->setWeight($pos, log($numDocs / $numInstWithWord, 10));
        }
    }

    /**
     * Stores the entity association string for a k-mer position.
     *
     * @param int    $pos    k-mer position.
     * @param string $entity Comma-separated list of line indices.
     */
    public function setAssoc($pos, $entity)
    {
        $this->entity_assoc[$pos] = $entity;
    }

    /**
     * Returns the entity association string for a k-mer position.
     *
     * @param int $pos k-mer position.
     * @return string The stored list, or '' when nothing is stored for $pos.
     */
    public function getAssoc($pos)
    {
        if (array_key_exists($pos, $this->entity_assoc)) {
            return $this->entity_assoc[$pos];
        }

        return '';
    }

    /**
     * Registers an entity under the given index.
     *
     * @param int    $pos    Index of the entity (init() uses the line index).
     * @param string $entity Entity text.
     */
    public function addEntity($pos, $entity)
    {
        $this->entity_list[$pos] = $entity;
    }

    /**
     * Splits a line into overlapping substrings of up to getkSize() bytes,
     * one starting at every byte offset.
     *
     * The substrings that start within the last kSize-1 bytes of the line are
     * shorter than kSize; callers that need full-length k-mers must filter
     * them out.
     *
     * @param string $line Text to split.
     * @return array List of substrings in order of their start offset.
     */
    public function processLine($line)
    {
        $len = strlen($line);
        $size = $this->getkSize();
        $kmers = array();
        for ($i = 0; $i < $len; $i++) {
            $kmers[] = substr($line, $i, $size);
        }

        return $kmers;
    }

    /**
     * Sets the k-mer length.
     *
     * @param int $s New k-mer length.
     */
    public function setkSize($s)
    {
        $this->kSize = $s;
    }

    /**
     * @return int The k-mer length.
     */
    public function getkSize()
    {
        return $this->kSize;
    }

    /**
     * @param int $pos k-mer position.
     * @return float|int The weight stored for $pos.
     */
    public function getWeight($pos)
    {
        return $this->kmer_weights[$pos];
    }

    /**
     * @param int       $pos k-mer position.
     * @param float|int $w   Weight to store.
     */
    public function setWeight($pos, $w)
    {
        $this->kmer_weights[$pos] = $w;
    }

    /**
     * @param int $pos k-mer position.
     * @return int The occurrence count stored for $pos.
     */
    public function getFreq($pos)
    {
        return $this->kmer_freq[$pos];
    }

    /**
     * @param int $pos   k-mer position.
     * @param int $count Occurrence count to store.
     */
    public function setFreq($pos, $count)
    {
        $this->kmer_freq[$pos] = $count;
    }
}