-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathsort_doc_topics
More file actions
executable file
·73 lines (60 loc) · 2.21 KB
/
sort_doc_topics
File metadata and controls
executable file
·73 lines (60 loc) · 2.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env perl
#===============================================================================
#
# FILE: sort_doc_topics
#
# USAGE: ./sort_doc_topics < doc-topics.txt > sorted-doc-topics.tsv
#
# DESCRIPTION: read mallet doc-topics output from stdin, write a tab-delimited
# file with the topic proportion data ordered by 0-indexed topic number to
# stdout. I haven't found a fast way to do this in R, so here it is in perl
# (fast).
#
# If the columns of the doc-topics output are 0, 1, 2, ...
# and there are 100 topics,
# then the topic numbers are at positions 2, 4, ... 200
# and the proportions are at 3, 5, ... 201
# but topics themselves are numbered 0 .. 99
# (position 0 is the doc number and position 1 is the doc name)
#
# This script discards the initial comment line from the mallet output
# as well as the trailing tab stop at the end of every data line
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Andrew Goldstone (agoldst), andrew.goldstone@gmail.com
# ORGANIZATION: Rutgers University, New Brunswick
# VERSION: 1.0
# CREATED: 07/24/2012 17:12:29
# REVISION: ---
#===============================================================================
use v5.14; # entails strict, unicode_strings
use autodie;
use utf8; # source code itself is in utf-8
use warnings;
use warnings FATAL => "utf8"; # Unicode encode errors are fatal
use open qw( :std :utf8 ); # utf8 layer for open, stdin/out
# Read and discard the header (the initial comment line of the mallet output).
my $header = <STDIN>;

# Number of topics. It is fixed by the first data line: positions 0 and 1 hold
# the doc number and doc name, and every later pair of fields is
# (topic number, proportion), so the count is (number of fields - 2) / 2.
my $n;

# A plain while loop (rather than do/while with a pre-read) means that empty
# or header-only input produces no output instead of one bogus row.
while (defined(my $line = <STDIN>)) {
    # Strip the line ending so that it can never end up inside a field.
    $line =~ s/\r?\n\z//;

    # Skip blank lines, e.g. a stray empty line at the end of the file.
    next if $line eq '';

    # split drops trailing empty fields, which discards the tab stop mallet
    # adds at the end of every data line. Counting fields instead of tabs
    # keeps the topic count right whether or not that trailing tab is present.
    my @row = split /\t/, $line;
    $n //= int((@row - 2) / 2);

    # Map each topic number found in this row to its proportion.
    my %proportion_of;
    for (my $i = 2; $i < $#row; $i += 2) {
        $proportion_of{$row[$i]} = $row[$i + 1];
    }

    # Keep the first two columns, then output the proportions ordered by
    # 0-indexed topic number. A topic absent from this row yields an empty
    # field rather than silently picking up the value of another column.
    say join "\t", @row[0, 1], map { $proportion_of{$_} // '' } 0 .. $n - 1;
}