diff --git a/app/assets/stylesheets/general.css.scss b/app/assets/stylesheets/general.css.scss
index 9f2c101..2fb7711 100644
--- a/app/assets/stylesheets/general.css.scss
+++ b/app/assets/stylesheets/general.css.scss
@@ -1,350 +1,355 @@
.main-container {
height:auto;
min-height:100vh;
padding-top:1rem;
}
+.pipeline-figure {
+ height:80vh;
+ width:auto;
+}
+
.doc-table tr td,
.doc-table tr th,
.version-table tr td {
text-align:left
}
.border-left {
border-left:1px solid #ccc;
}
.border-right {
border-right:1px solid #ccc;
}
.border-top {
border-top:1px solid #ccc;
}
.border-bottom {
border-bottom:1px solid #ccc;
}
.inline-block {
display:inline-block;
}
.navbar.navbar-top {
-moz-box-shadow: 0 2px 2px #808080;
-webkit-box-shadow: 0 2px 2px #808080;
box-shadow: 0 2px 2px #808080;
}
.navbar.navbar-bottom {
border-top:1px solid #ccc;
}
.side-index-container {
padding:0;
z-index:0;
}
.side-index-container .side-index {
height:100vh;
overflow:auto;
width:100%;
}
.side-index-container .side-index-list {
margin-bottom:50vh;
padding-left:5px;
padding-right:5px;
padding-top:1rem;
}
.topbar-padding {
padding-top:56px;
}
.topbar-margin {
margin-top:56px;
}
.dropdown-item {
cursor:pointer;
}
.float-right {
float:right
}
.align-center {
text-align:center;
}
.align-justify {
text-align:justify;
}
.align-left {
text-align:left;
}
.align-right {
text-align:right;
}
ul.no-bullets li {
list-style-type:none;
}
.full-width {
width:100%;
}
.full-height {
height:100%;
}
.align-items-center {
align-items:center;
}
.hidden {
display:none;
}
.row {
display: -webkit-box;
display: -webkit-flex;
display: -ms-flexbox;
display: flex;
-webkit-flex-wrap: wrap;
-ms-flex-wrap: wrap;
flex-wrap: wrap;
}
.video-wrapper {
position: relative;
padding-bottom: 56.25%; /* 16:9 */
padding-top: 25px;
height: 0;
iframe {
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
}
}
#popup_window {
position: absolute;
z-index:100000;
padding:15px;
display: none;
background: #ccc;
border: 1px solid;
}
#popup_window_close {
float:right;
font-size:20px;
margin-top:-19px;
margin-right:-10px;
cursor:pointer;
}
.tip-window-container {
position:relative;
display:block;
width:0;
height:0;
}
.tip-window {
position: absolute;
display:block;
z-index:1;
margin: 15px 0 0 0;
padding:10px;
background: #e6e6e6;
border: 1px solid #333;
color:#333;
width:300px;
text-align:left;
white-space:normal;
}
.tip-window:before,
.tip-window:after {
position:absolute;
content:"";
width:0;
height:0;
}
.tip-window.right:before,
.tip-window.right:after {
right:10px;
}
.tip-window.left:before,
.tip-window.left:after {
left:10px;
}
.tip-window:before {
top:-15px;
border-bottom:15px solid #333;
border-left:15px solid transparent;
border-right:15px solid transparent;
}
.tip-window:after {
top:-14px;
z-index:1;
border-bottom:15px solid #e6e6e6;
border-left:15px solid transparent;
border-right:15px solid transparent;
}
.tip-window-close {
cursor:pointer;
}
.title_popup {
white-space:nowrap;
font-weight:bold;
}
.infos {
position:static;
}
.background-col-1 {
background-color:#e6e6e6;
}
.background-col-2 {
color:#fff;
background-color:#999;
}
.row-padding-10 {
padding-top:10vh;
padding-bottom:10vh;
}
.row-padding-5 {
padding-top:5vh;
padding-bottom:5vh;
}
.row-padding-2 {
padding-top:2vh;
padding-bottom:2vh;
}
.row-padding-bottom-20 {
padding-bottom:20vh;
}
.v-align-middle {
height:100%;
transform:translate(0, 35%);
}
.back-to-top {
color:#999;
text-decoration: none;
&:hover,
&:focus,
&:active {
color:#ccc;
text-decoration: none;
}
}
.content {
position:static;
display: table;
margin-right: auto;
margin-left: auto;
margin-top: 60px;
margin-bottom: 60px;
.left {
float:left;
margin-right:20px;
}
.right {
float:right;
margin-left:20px;
}
.align-left {
text-align:left;
}
.align-right {
text-align:right;
}
.centered-div {
width: 260px;
line-height: 300px;
}
.centered-span {
display: inline-block;
vertical-align: middle;
line-height: normal;
}
}
a {
color: #333;
&.white {
color: #fff;
}
&:link, &:visited {
text-decoration: none;
}
&:hover {
text-decoration: underline;
&.button,
&.tab,
&.subtab,
&.help-button,
&.slider,
&.slide-open,
&.slide-close,
&.test-button {
text-decoration: none;
}
&.slide-open, &.slide-close {
text-shadow: 1px 0px #333, -1px -0px #333, 0px -1px #333, 0px 1px #333;
}
}
}
h1 {
font-weight: 500;
font-size: 28px;
line-height: 34px;
}
h2 {
font-weight: 500;
font-size: 24px;
line-height: 30px;
}
h3 {
font-weight: 500;
font-size: 20px;
line-height: 25px;
}
pre {
font-family: verdana, arial, helvetica, sans-serif;
background-color: #eee;
padding: 10px;
font-size: 12px;
white-space: pre-wrap;
white-space: -moz-pre-wrap;
white-space: -pre-wrap;
white-space: -o-pre-wrap;
word-wrap: break-word;
}
diff --git a/app/controllers/jobs_controller.rb b/app/controllers/jobs_controller.rb
index a6e85b5..e6b667c 100644
--- a/app/controllers/jobs_controller.rb
+++ b/app/controllers/jobs_controller.rb
@@ -1,832 +1,791 @@
class JobsController < ApplicationController
before_action :set_job, only: [:show, :edit, :update, :destroy, :serve, :serve_archive, :serve_stderr, :view, :refresh, :clone]
- before_action :check_storage_limit, only: [:new, :create]
+ before_action :check_storage_limit, only: [:new, :create, :update, :clone]
# before_action :authenticate_user!, only: [:show, :edit, :update, :destroy, :serve]
def check_storage_limit
if (controller_name == 'jobs' and ['create', 'new', 'update', 'clone'].include? action_name)
if current_user and current_user.storage_quota > 0 and current_user.total_jobs_size > current_user.storage_quota
flash[:notice] = "Sorry but you exceeded your storage quota. Please clean up old analyses first."
redirect_to root_url
end
end
end
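# Kills a running pipeline for job j before it is re-run or replaced.
# A sketch of the matching logic (the ps line below is illustrative only):
# the main PID is read from the job's .pid file, then every `ps -ef` line
# mentioning the job key, e.g.
#   www-data  4321   1  0 10:02 ?  00:00:00 ruby pipeline ab12cd
# is split on whitespace; field 2 is the PID passed to `kill`.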
def kill_job j
tmp_dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + j.user_id.to_s + j.key
main_pid = File.read(tmp_dir + ".pid") if File.exist?(tmp_dir + ".pid")
# kill job if one already running
existing_main_job = `ps -ef | grep #{j.key} | grep #{main_pid} | grep -v 'grep'`
puts "Existing main job: " + existing_main_job.to_json
# pids=[]
if main_pid and !existing_main_job.empty?
lines = `ps -ef | grep #{j.key} | grep -v 'grep'`.split("\n").select{|l| !l.empty?}
pids = lines.map{|l| t= l.split(/\s+/); t[1]}
pids.unshift(main_pid)
if pids.size > 0
pids.each do |pid|
cmd = "kill #{pid}"
`#{cmd}`
end
end
end
# delete log.json
log_json = tmp_dir + "output" + "log.json"
#logger.debug("DEBUG: " + log_json.to_s)
File.delete log_json if File.exist? log_json
end
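# Copies a job (record and files) into the current session or user account.
# A note on the perl one-liner below: output/log.json stores file paths of the
# form .../<user_id>/<key>/..., so an in-place substitution rewrites the old
# user_id/key pair to the new one, e.g. /1/oldkey/ -> /42/newkey/ (the ids and
# keys shown here are made up for illustration).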
def clone
tmp_dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @job.user_id.to_s + @job.key
@new_job = nil
if current_user
@new_job = @job.dup
session[:current_key] = create_key()
@new_job.key = session[:current_key]
else
current_job = Job.where(:key => session[:current_key]).first
if current_job
### kill current job
kill_job(current_job)
@new_job = current_job
@new_job.update_attributes(:form_json => @job.form_json,
:output_json => @job.output_json,
:name => @job.name,
:description => @job.description,
:status => @job.status)
else
@new_job = @job.dup
@new_job.key = session[:current_key]
end
end
### clone
@new_job.sandbox = (current_user) ? false : true
@new_job.user_id = (current_user) ? current_user.id : 1
new_tmp_dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @new_job.user_id.to_s
Dir.mkdir(new_tmp_dir) if !File.exist? new_tmp_dir
new_tmp_dir += @new_job.key
if File.exist? new_tmp_dir
FileUtils.rm_r new_tmp_dir
Dir.mkdir new_tmp_dir
end
FileUtils.cp_r tmp_dir.to_s + "/.", new_tmp_dir
### rename cloned archive
FileUtils.mv (new_tmp_dir + (@job.key + "." + APP_CONFIG[:archive_format])), (new_tmp_dir + (@new_job.key + "." + APP_CONFIG[:archive_format])) if File.exist?(new_tmp_dir + (@job.key + "." + APP_CONFIG[:archive_format]))
### change filepaths in the result file
cmd = "perl -i -pe 's/\\/#{@job.user_id}\\/#{@job.key}\\//\\/#{@new_job.user_id}\\/#{@new_job.key}\\//g' #{new_tmp_dir}/output/log.json"
#logger.debug("CMD: #{cmd}")
`#{cmd}`
### delete tmp_dir
FileUtils.rm_r new_tmp_dir + "tmp" if File.exist?(new_tmp_dir + "tmp")
@new_job.name += " copy"
if @new_job.save
redirect_to job_path(@new_job.key)
end
end
def serve
if readable? @job
sleep 0.5
- job_dir = Pathname.new(APP_CONFIG[:data_dir]) + 'users' + @job.user.id.to_s + @job.key
+ job_dir = Pathname.new(APP_CONFIG[:data_dir]) + 'users' + @job.user_id.to_s + @job.key
filename = params[:filename]
filepath = job_dir + filename
if File.exist?(filepath)
send_file filepath.to_s, type: params[:content_type] || 'text', disposition: (!params[:display]) ? ("attachment; filename=" + filename.gsub("/", "_")) : ''
else
flash[:notice] = 'Sorry, this file is no longer available, please try again later.'
redirect_to job_path(@job.key)
end
end
end
def serve_archive
if readable? @job
`rails make_job_archive[#{@job.key}]`
- job_dir = Pathname.new(APP_CONFIG[:data_dir]) + 'users' + @job.user.id.to_s + @job.key
+ job_dir = Pathname.new(APP_CONFIG[:data_dir]) + 'users' + @job.user_id.to_s + @job.key
filename = "#{@job.key.to_s}.#{APP_CONFIG[:archive_format]}"
filepath = job_dir + filename
t = (APP_CONFIG[:archive_format] == 'zip') ? 'zip' : 'tar+gzip'
if File.exist?(filepath)
send_file filepath.to_s, type: t, disposition: "attachment; filename=" + filename.gsub("/", "_")
end
end
end
def serve_stderr
if readable? @job
`rails format_job_stderr[#{@job.key}]`
- job_dir = Pathname.new(APP_CONFIG[:data_dir]) + 'users' + @job.user.id.to_s + @job.key
+ job_dir = Pathname.new(APP_CONFIG[:data_dir]) + 'users' + @job.user_id.to_s + @job.key
filename = "stderr.txt"
filepath = job_dir + filename
if File.exist?(filepath)
send_file filepath.to_s, type: 'text', disposition: "attachment; filename=" + filename
end
end
end
def read_file_header
if params[:file_key]
new_data = []
-
- user_id = (current_user) ? current_user.id.to_s : "1"
- dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + user_id + session[:current_key] + 'input'
+ @job = Job.where(:key => session[:current_key]).first
+ dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @job.user_id.to_s + @job.key + 'input'
filename = params[:file_key] + ".txt"
filepath = dir + filename
test = ""
if File.exist? filepath
lines = []
File.open(filepath, "r") do |f|
while(l = f.gets) do
lines.push(l.chomp)
end
end
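# Header detection (assumed file convention): j is the index of the first
# line not starting with '#'; when j > 1 the header row is taken from the
# last comment line (i = j-1), otherwise from the first line (i = 0).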
j=0
(0 .. lines.size-1).to_a.each do |i|
if !lines[i].match(/^#/)
j=i
break
end
end
i = (j > 1) ? j-1 : 0
headers = lines[i].split("\t")
if params[:add_blank]
new_data.push({label:"", value:""});
end
if headers.size > 0
headers.each do |header|
new_data.push({:label => header, :value => header});
end
end
end
render :text => new_data.to_json
else
render :nothing => true
end
end
def read_file_column
if params[:file_key]
new_data = []
user_id = (current_user) ? current_user.id.to_s : "1"
dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + user_id + session[:current_key] + 'input'
filename = params[:file_key] + ".txt"
filepath = dir + filename
if File.exist? filepath
lines = File.readlines(filepath)
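# Same header-detection convention as in read_file_header above: line i is
# treated as the header row and pos_col locates params[:col_name] within it.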
j=0
(0 .. lines.size-1).each do |i|
if !lines[i].match(/^#/)
j = i
break
end
end
i = (j > 1) ? j-1 : 0
header_els = lines[i].chomp.split("\t")
pos_col = 0
(0 .. header_els.size-1).each do |k|
if header_els[k] == params[:col_name]
pos_col = k
break
end
end
max_val=0
(i .. lines.size-1).each do |j|
t = lines[j].split("\t")
n = t[pos_col].split(";").size
if n > max_val
max_val = n
end
end
if max_val > 0
(1 .. max_val+1).each do |j|
new_data.push({:label => j, :value => j});
end
end
end
render :text => new_data.to_json
else
render :nothing => true
end
end
# BUILD THE INDEX OF JOBS FROM CURRENT USER OR ALL (ADMIN ONLY)
def index
@scope = params[:scope]
if current_user
if current_user.role == "admin" and @scope == 'global'
@jobs = Job.all
else
@jobs = current_user.jobs.all
end
end
get_total_jobs_size()
respond_to do |format|
format.html{
if !current_user
redirect_to "/welcome"
end
}
end
end
def get_views
@h_views = {}
View.all.each do |v|
@h_views[v.name]= v
end
end
def get_statuses
log_file = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @user.id.to_s + @job.key + 'output' + 'log.json'
- #stdout_file = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @user.id.to_s + @job.key + 'output' + 'stdout.log'
- #stderr_file = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @user.id.to_s + @job.key + 'output' + 'stderr.log'
-
- #if File.exist? stdout_file and !File.size(stdout_file) != 0
- # @stdout_log = JSON.parse(File.read(stdout_file)).select {|e| e[0] != nil}
- #end
- #if File.exist? stderr_file and !File.size(stderr_file) != 0
- # @stderr_log = JSON.parse(File.read(stderr_file)).select {|e| e[0] != nil}
- #end
@h_statuses = {}#'completed' => 1, 'pending' => 2, 'running' => 3, 'failed' => 4}
Status.all.each do |s|
@h_statuses[s.name]= s
end
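# Status rollup (sketch): a step's aggregate status is the sub-step status
# with the highest precedence, e.g. with precedences completed < pending <
# running < failed, a single failed sub-step marks the whole step as failed.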
if File.exist? log_file and File.size(log_file) != 0
@log_json = JSON.parse(File.read(log_file))
@final_json = {
:global_status => nil,
:status_by_step => {},
:status_by_substep => {},
:global_status_by_step => {},
:messages_by_step => {}
}
@h_icons={
'description' => 'fa fa-info-circle',
'description-item' => 'ml-3',
'comment' => 'fa fa-info-circle',
'output' => 'fa fa-file-text-o',
'warning' => 'fa fa-exclamation-triangle icon-warning',
'error' => 'fa fa-exclamation-triangle icon-danger'
}
@test = ""
### datasets
@log_json.select{|e| ['dataset', 'map'].include?(e['type'])}.each do |cat|
@final_json[:status_by_step][cat['name']] ||= []
@final_json[:global_status_by_step][cat['name']] ||= nil
cat['log'].select{|e| e['type'] != 'file'}.each do |el|
@final_json[:status_by_substep][el['name']]=[]
step_key = cat['name'] + "_" + el['name']
@final_json[:messages_by_step][step_key] ||= []
tmp_global_status = nil
if el['operations']
el['operations'].each do |el2|
@final_json[:status_by_substep][el['name']].push({:name => el2['name'], :status => el2['status'], :execution_time => el2['execution_time'], :cat => cat['name']})
@final_json[:messages_by_step][step_key].push({:name => el2['name'], :messages => el2['messages'], :cat => cat['name']})
if !tmp_global_status or (@h_statuses[el2['status']] and @h_statuses[el2['status']].precedence > @h_statuses[tmp_global_status].precedence)
tmp_global_status = el2['status']
end
end
end
@final_json[:status_by_step][cat['name']].push({:name => el['name'], :status => (el['status'] || tmp_global_status), :execution_time => el['execution_time']})
if !@final_json[:global_status_by_step][cat['name']] or (@h_statuses[el['status']] and @h_statuses[el['status']].precedence > @h_statuses[@final_json[:global_status_by_step][cat['name']]].precedence)
@final_json[:global_status_by_step][cat['name']] = el['status']
end
if !@final_json[:global_status] or (@h_statuses[el['status']] and @h_statuses[el['status']].precedence > @h_statuses[@final_json[:global_status]].precedence)
@final_json[:global_status] = el['status']
end
@final_json[:messages_by_step][step_key].push({:name => el['name'], :messages => el['messages']})
end
end
### analyses
analyses = @log_json.select{|e| e['type'] == 'analysis'}
if analyses.size > 0
analyses.first['log'].each do |el|
@final_json[:global_status_by_step][el['name']] ||= nil
@final_json[:status_by_step][el['name']] ||= []
@final_json[:messages_by_step][el['name']] ||= []
tmp_global_status = nil
if el['levels']
el['levels'].each do |el2|
@final_json[:status_by_step][el['name']].push({:name => el2['name'], :status => el2['status'], :execution_time => el2['execution_time']})
@final_json[:messages_by_step][el['name']].push({:name => el2['name'], :messages => el2['messages']})
if !tmp_global_status or (@h_statuses[el2['status']] and @h_statuses[el2['status']].precedence > @h_statuses[tmp_global_status].precedence)
tmp_global_status = el2['status']
end
end
end
if !@final_json[:global_status_by_step][el['name']] or (@h_statuses[el['status']] and @h_statuses[el['status']].precedence > @h_statuses[@final_json[:global_status_by_step][el['name']]].precedence)
@final_json[:global_status_by_step][el['name']] = el['status']
end
if !@final_json[:global_status] or (@h_statuses[el['status']] and @h_statuses[el['status']].precedence > @h_statuses[@final_json[:global_status]].precedence)
@final_json[:global_status] = el['status']
end
@final_json[:status_by_step][el['name']].push({:name => el['name'], :status => (el['status'] || tmp_global_status), :execution_time => el['execution_time']})
@final_json[:messages_by_step][el['name']].push({:name => el['name'], :messages => el['messages']})
end
end
@update = 0
@status_vector = []
if @final_json
if ['primary_dataset', 'map', 'secondary_dataset'].include? session[:selected_view]
if @final_json[:status_by_step][session[:selected_view]]
@status_vector += @final_json[:status_by_step][session[:selected_view]].map{|e| @h_statuses[e[:status]].id}
@final_json[:status_by_substep].keys.sort.each do |k|
@status_vector += @final_json[:status_by_substep][k].select{|e| e[:cat] == session[:selected_view]}.map{|e| @h_statuses[e[:status]].id}
end
end
@final_json[:messages_by_step].keys.sort.each do |k|
@status_vector += @final_json[:messages_by_step][k].select{|e| e[:cat] == session[:selected_view] and e[:messages]}.map{|e| e[:messages].size}
end
else
if @final_json[:status_by_step][session[:selected_view]]
@status_vector += @final_json[:status_by_step][session[:selected_view]].map{|e| @h_statuses[e[:status]].id}
end
if @final_json[:messages_by_step][session[:selected_view]]
@status_vector += @final_json[:messages_by_step][session[:selected_view]].select{|e| e[:messages]}.map{|e| e[:messages].size}
end
end
end
if session[:status_vector] != @status_vector
@update = 1
end
end ## end file exist
end
def refresh
# get_basic_info()
get_statuses()
get_job_size()
render :partial => "refresh"
end
def view
# @user = (current_user) ? current_user : User.where(:username => 'guest').first
get_basic_info()
get_statuses()
get_job_size()
get_views()
@form_json = JSON.parse @job.form_json if @job.form_json and !@job.form_json.empty?
if !session[:current_level]
session[:current_level] = (@form_json['bin_levels'] and @form_json['bin_levels'].size > 0) ? @form_json['bin_levels'][0] : nil
end
session[:current_level] = params[:current_level].to_i if params[:current_level]
@data_json = nil
@filename = nil
@imagename = nil
@description = ''
@warning = ''
@error = ''
@i = 0
@log = ''
if !['details', 'primary_dataset', 'secondary_dataset', 'map'].include?(params[:partial])
e = @log_json.select{|el| el['type'] == 'analysis'}.first['log'].select{|el| el['name'] == params[:partial]}.first
if e['levels'] #and @form_json['bin_levels']
@i = @form_json['bin_levels'].index(session[:current_level].to_s)
@form_json['bin_levels'].each_index do |li|
l = @form_json['bin_levels'][li]
@log += ">" + l.to_s + ";"
if l == session[:current_level].to_s
@i = li
@log += '!!'
end
end
e2 = e['levels'][@i]
@filename = e2['messages'].select{|el| el['output'] and el['output'].match(/#{@h_views[params[:partial]].data_format}$/)}.map{|el| el['output']}.first
@imagename = e2['messages'].select{|el| el['output'] and el['output'].match(/pdf|png|jpg$/)}.map{|el| el['output']}.first
@description = e2['messages'].select{|el| el['description']}.map{|el| el['description']}
@warning = e2['messages'].select{|el| el['warning']}.map{|el| el['warning']}
@error = e2['messages'].select{|el| el['error']}.map{|el| el['error']}
else
@i = -1
@filename = e['messages'].select{|el| el['output'] and el['output'].match(/#{@h_views[params[:partial]].data_format}$/)}.map{|el| el['output']}.first
@imagename = e['messages'].select{|el| el['output'] and el['output'].match(/pdf|png|jpg$/)}.map{|el| el['output']}.first
@description = e['messages'].select{|el| el['description']}.map{|el| el['description']}
@warning = e['messages'].select{|el| el['warning']}.map{|el| el['warning']}
@error = e['messages'].select{|el| el['error']}.map{|el| el['error']}
end
end
- #end
-
if @filename and File.exist? @filename
@data_json = File.read(@filename)
end
session[:selected_view] = params[:partial]
json_file = Rails.root.join('lib', 'genocrunch_console', 'etc', 'genocrunchlib.json')
file_content = File.read(json_file)
h = JSON.parse(file_content)
@h_form_choices = h['choices']
render :partial => "view_" + params[:partial]
end
# ALLOW SESSION VARIABLE UPDATE ON SHOW
# THIS ALLOWS CONDITIONAL RENDERING OF EITHER FIGURES OR EDIT FORM PARTIALS
def show
- # @user = (current_user) ? current_user : User.where(:username => 'guest').first
- # if current_user.role == "admin" and session[:which] == "all"
- # @jobs = Job.all
- # @users = User.all
- # else#if current_user
- # @jobs = current_user.jobs.all
- # end
- # session[:context] = params[:context]
- # logger.debug("JOB: " + @job.to_json)
- #check_box belongs_to
get_basic_info()
get_statuses()
get_job_size()
get_views()
@analyses = []
@inputs = []
@h_job_form = JSON.parse(@job.form_json)
session[:current_level] = (@h_job_form['bin_levels'] && @h_job_form['bin_levels'][0]) || nil
@h_form['fields']['Inputs'].select{|f| f['type'] == 'file'}.each do |f|
if @h_job_form[f['id']]
@inputs.push({:id => f['id'], :label => f['label'] || f['id'].gsub(/_/, ' ').capitalize})
end
end
@h_form['fields']['Analysis'].select{|f| f['type'] == 'check_box' and !f['belongs_to']}.each do |f|
if @h_job_form[f['id']] #and @h_job_form[f['id']]==true
@analyses.push({:id => f['id'], :label => f['label'] || f['id'].gsub(/_/, ' ').capitalize})
end
end
if !readable? @job
redirect_to root_path, notice: 'Cannot access this resource.'
end
end
# ALLOW UPDATE OF INDEX UPON CHANGE
def update_index
@previous_jobs = params[:previous_jobs] || nil
@scope = params[:scope]
if current_user.role == "admin" and @scope == 'global'
@jobs = Job.all
else
@jobs = current_user.jobs
end
get_total_jobs_size()
end
def get_basic_info
json_file = Rails.root.join('lib', 'genocrunch_console', 'etc', 'genocrunchlib.json')
file_content = File.read(json_file)
@h_form = JSON.parse(file_content)
@h_tips = JSON.parse(File.read(Rails.root.join('public', 'app', 'tips.json')))
@h_help = {}
@h_field_groups = {}
@h_form['fields'].each_key{|card_title|
@h_form['fields'][card_title] = @h_form['fields'][card_title].select{|field| field['scope'] != 'cli_only'}
@h_form['fields'][card_title].map{|field|
@h_help[field['id']] = field['help']
if field['belongs_to']
@h_field_groups[field['belongs_to']]||=[]
@h_field_groups[field['belongs_to']].push(field)
end
}
}
end
# SET A NEW JOB
def new
@user = (current_user) ? current_user : User.where(:username => 'guest').first
-
- if @user.storage_quota > 0 and @user.total_jobs_size > @user.storage_quota
- flash[:notice] = 'Sorry, the storage quota is exceeded.\nPlease clean up old analysis first.'
- redirect_to root_url
- return
- end
-
@job = @user.jobs.new
session[:current_key] = create_key()
@job.key = session[:current_key]
get_basic_info
@default = {}
end
def set_fields p
@missing_fields = []
@present_fields = []
get_basic_info
@h_fields ={}
@h_form['fields'].each_key{|card_title|
@h_form['fields'][card_title].map{|field|
@h_fields[field['id']]=field
}
}
@log = ''
flag=0
if p
flag=1
- ### check if some parameters are not allowed
+ ### check if some parameters are not allowed
p.each_key do |k|
if !@h_fields[k]
flag = 0
break
end
end
- ### check if all parameters required are submitted
+ ### check if all parameters required are submitted
@h_fields.keys.select{|k| @h_fields[k]['optional'] == false}.each do |k|
#logger.debug("EXPLORE field " + k)
if (@h_fields[k]['type'] == 'file' and ((p[k] and !p[k].blank?) or !params[:p2][k].empty?)) or (p[k] and !p[k].empty?)
@present_fields.push(@h_fields[k])
else
@missing_fields.push(@h_fields[k])
flag = 0
end
end
end
return flag
end
def create_dirs
- user_id = (current_user) ? current_user.id.to_s : "1"
-
- dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + user_id
+ dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @job.user_id.to_s
Dir.mkdir dir if !File.exist? dir
- dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + user_id + @job.key
+ dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @job.user_id.to_s + @job.key
Dir.mkdir dir if !File.exist? dir
end
def write_files p
- user_id = @job.user.id.to_s
- #user_id = (current_user) ? current_user.id.to_s : "1"
- dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + user_id + @job.key + 'input'
+ dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @job.user_id.to_s + @job.key + 'input'
Dir.mkdir dir if !File.exist? dir
form_json = JSON.parse @job.form_json if @job.form_json and !@job.form_json.empty?
fields = ['primary_dataset', 'map', 'secondary_dataset']
fields.each do |k|
#logger.debug("Key:" + k)
filepath = dir + ( k + '.txt')
content = (p[k]) ? p[k].read : nil
if content and !content.empty?
params[:p2][k] = p[k].original_filename
File.open(filepath, 'w') do |f|
#logger.debug("CONTENT FILE: " + content)
f.write content
end
`dos2unix #{filepath}`
`mac2unix #{filepath}`
elsif form_json and form_json[k.to_s]
params[:p2][k] = form_json[k.to_s]['original_filename']
params[:p][k] = form_json[k.to_s]
end
end
end
# CREATE A NEW JOB
def create
@job = Job.new(:key => params[:tmp_key])
-
+
+ @job.user_id = (current_user) ? current_user.id : 1
+ @job.sandbox = (current_user) ? false : true
+
create_dirs()
write_files(params[:p])
@valid_job = set_fields params[:p]
@job.name = params[:p][:name]
@job.description = params[:p][:description]
@job.form_json = params[:p].to_json
@default = params[:p]
- if current_user
- @job.user_id = current_user.id
- @job.sandbox = false
- end
-
## not possible to create a new job with an existing key
@valid_job = 0 if !@job.key.empty? and Job.where(:key => @job.key).first
respond_to do |format|
if @valid_job==1 and @job.save
@job.delay.perform # run analysis as delayed job
session[:agree_with_terms] = true
session[:current_key]=create_key() if current_user
format.html { redirect_to job_path(@job.key) }
format.json { render action: 'new', status: :created, location: @job }
else
format.html { render action: 'new' }
format.json { render json: @job.errors, status: :unprocessable_entity }
end
end
end
def edit
get_basic_info
@default = JSON.parse(@job.form_json)
params[:p2]={}
['primary_dataset', 'map', 'secondary_dataset'].select{|k| @default[k]}.each do |k|
params[:p2][k] = @default[k]['original_filename']
end
end
# PATCH/PUT /jobs/1
# PATCH/PUT /jobs/1.json
def update
@update_only = params[:job][:update_only] if params[:job]
if @update_only
@job.update_attributes(@update_only.to_sym => params[:job][@update_only.to_sym])
# Update form_json
form_json = JSON.parse(@job.form_json)
form_json[@update_only] = params[:job][@update_only.to_sym]
@job.update_attributes(:form_json => form_json.to_json)
# Update params.json files
data_dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @job.user_id.to_s + @job.key.to_s
['input', 'output'].each do |p|
p_fp = data_dir + p + "params.json"
if File.exist?(p_fp)
p_content = JSON.parse(File.read(p_fp))
p_content[@update_only] = params[:job][@update_only.to_sym]
File.open(p_fp, 'w') do |f|
f.write p_content.to_json
end
end
end
else
write_files(params[:p])
valid_job = set_fields params[:p]
h_job = {
:name => params[:p][:name],
:description => params[:p][:description],
:form_json => params[:p].to_json
}
@default = JSON.parse(@job.form_json)
@default = params[:p]
respond_to do |format|
if valid_job==1 and @job.update_attributes(h_job)
kill_job(@job)
@job.delay.perform # run analysis as delayed job
session[:agree_with_terms] = true
format.html {redirect_to job_path(@job.key)}
format.json { head :no_content }
else
format.html { render action: 'edit'}
format.json { render json: @job.errors, status: :unprocessable_entity }
end
end
end
end
# DELETE /jobs/1
# DELETE /jobs/1.json
def destroy
job_dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @job.user_id.to_s + @job.key.to_s
FileUtils.rm_rf(job_dir)
@job.destroy
respond_to do |format|
format.html { redirect_to jobs_url }
format.json { head :no_content }
end
end
private
def get_job_size
data_dir = Pathname.new(APP_CONFIG[:data_dir]) + "users" + @job.user_id.to_s + @job.key.to_s
js = `du -sb #{data_dir} | cut -f1`.to_i
if js != @job.size.to_i
@job.update_column(:size, js)
end
end
def get_total_jobs_size
if current_user
@total_jobs_size = current_user.jobs.all.sum(&:size).to_i
if @total_jobs_size != current_user.total_jobs_size.to_i
current_user.update_column(:total_jobs_size, @total_jobs_size)
end
end
end
# Use callbacks to share common setup or constraints between actions.
def set_job
-# if current_user.role == "admin"
-# @job = Job.all.find(params[:key])
-# else
-# @job = current_user.jobs.find(params[:key])
-# end
@job = Job.where(:key => params[:key] || params[:tmp_key]).first
- session[:current_key] = @job.key if action_name != 'clone' #if current_user
+ session[:current_key] = @job.key if action_name != 'clone'
@user = @job.user
#logger.debug("JOB: " + @job.to_json)
- # @job = nil if !readable? @job
end
# Never trust parameters from the scary internet, only allow the white list through.
def job_params
params.require(:job).permit()
end
end
diff --git a/app/views/home/_doc.html.erb b/app/views/home/_doc.html.erb
index 72aa511..d1e0766 100644
--- a/app/views/home/_doc.html.erb
+++ b/app/views/home/_doc.html.erb
@@ -1,515 +1,515 @@
Documentation
Version 1.3 / 04.05.2018
Overview
Genocrunch is a web-based data analysis platform dedicated to metagenomics and metataxonomics. It is tailored to process datasets derived from high-throughput nucleic acid sequencing of microbial communities <%= image_tag('genocrunch_pipeline-1.png', height:'14px') %>, such as gene counts or counts of operational taxonomic units (OTUs) <%= image_tag('genocrunch_pipeline-2.png', height:'14px') %>. In addition to such a primary dataset, it also allows the integration of a secondary dataset (e.g. metabolite levels) <%= image_tag('genocrunch_pipeline-3.png', height:'14px') %>.
Genocrunch provides tools covering data pre-processing and transformation, diversity analysis, multivariate statistics, dimensionality reduction, differential analysis, clustering as well as similarity network analysis and associative analysis.
The results of clustering are automatically inferred as additional models into other analyses <%= image_tag('genocrunch_pipeline-4.png', height:'14px') %>.
Interactive visualization is offered for all figures.
Running an analysis session does not require any registration. However, managing multiple analyses (recovering/editing previous work) is done through a personal index that requires users to sign in. Before signing in, new users need to register.
Registering
Registering is done via the Register button of the top-bar. The registration process requires new users to choose a password. The email address is only used to recover forgotten passwords.
<% if APP_CONFIG[:user_confirmable] %>
In order to complete the registration process, new users need to follow a confirmation link which is automatically sent to the provided email address. In case of issues, this confirmation link can be resent to the provided email address via the Resend confirmation instruction link on the welcome page.
<% end %>
Signing in
Signing in is done from the welcome page, which is accessible via the Sign In button of the top-bar. Before signing in, new users need to complete the registration process.
Trying Genocrunch
The best way to try Genocrunch is to load an example and edit it.
Examples
Examples of analyses are provided via the Examples link of the top-bar. Loading Example data into a new analysis session is possible via the Load button. This makes it possible to visualize and edit Examples without restriction. For signed-in users, it will also copy the example into their personal analysis index.
Analyzing data
Running a new analysis
New analyses can be submitted via the New Analysis button of the topbar. If the user is signed out, the analysis session will be associated with the browser session and remain available as long as the browser is not closed. If the user is signed in, the analysis will be stored in their personal index. Analysis parameters are set by filling a single form comprising four parts: Inputs, Pre-processing, Transformation and Analysis. Analyses are then submitted via the Start Analysis button located at the bottom of the form. See the Inputs and Tools section below for details.
Editing an analysis
Analyses can be edited via the associated Edit button present on the analysis page and via the edit icons () of the analysis index (for signed-in users only). Editing is performed by updating the analysis form and submitting the modifications via the Restart Analysis button located at the bottom of the form.
Cloning an analysis
Analyses can be copied via the copy icons () of the analysis index (for signed-in users only).
Deleting an analysis
Analyses can be deleted via the delete icons () of the analysis index (for signed-in users only). This process is irreversible.
Inputs and Tools
The analyses are set from a single form composed of four sections: Inputs, Pre-processing, Transformation, Analysis. These sections and their associated parameters are described below.
Inputs
The Inputs section of the analysis creation form is used to upload data files.
General information
Name (mandatory)
The name will appear in the analysis index. It should be informative enough to differentiate it from other analyses.
Description
The description can be used to provide additional information about the analysis.
Data files
Primary dataset (mandatory)
The primary dataset is the principal data file. It must contain a data table in the tab-delimited text format with columns representing samples and rows representing observations. The first row must contain the names of samples. The first column must contain the names of the observations. Both column and row names must be unique. A column containing a description of each observation, in the form of semicolon-delimited categories, can be added at the end. This format can be obtained from the BIOM format using the biom_convert command.
To increase compatibility, the category column setting specifies which column of the primary dataset contains a categorical description of the observations. This must be either the first or the last column.
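Format example (with a trailing category column; counts and taxonomy strings are made up):
OTU_ID Spl1 Spl2 Spl3 taxonomy
OTU_1 103 58 0 k__Bacteria;p__Firmicutes
OTU_2 12 7 22 k__Bacteria;p__Bacteroidetes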
Map (mandatory)
The map contains information about the experimental design. It must contain a table in the tab-delimited text format with columns representing experimental variables and rows representing samples. The first column must contain the names of the samples as they appear in the primary dataset. The first row must contain the names of experimental variables. Both column and row names must be unique. This format is compatible with the mapping file used by the Qiime pipeline.
Format example:
ID Sex Treatment
Spl1 M Treated
Spl2 F Treated
Spl3 M Control
Spl4 F Control
Secondary dataset
The secondary dataset is an optional data file containing additional observations. It must contain a data table in the tab-delimited text format with columns representing samples and rows representing observations. The first row must contain the names of samples as they appear in the primary dataset and the map. The first column must contain the names of the observations. Both column and row names must be unique.
The Transformation section of the analysis creation form proposes additional modifications for both the primary and secondary datasets.
Rarefaction
Although controversial (McMurdie and Holmes, 2014), rarefaction is a popular method amongst microbial ecologists to correct for the variations in sequencing depth inherent to high-throughput sequencing methods. It consists of randomly sampling (without replacement) a fixed number of counts from each sample to be compared. Since the same number of counts is drawn from each sample, this corrects for differences in sequencing depth while conserving the original count distribution among observations. Note that the analysis of diversity is particularly sensitive to variations in sequencing depth.
Sampling depth
This specifies the number of counts to be drawn from each sample.
Tip: The maximal sampling depth corresponds to the minimal sequencing depth. First check the sequencing depth of your data by summing the counts in each sample. Then choose a sampling depth accordingly. For example, if the sample with the fewest counts totals 12,000, the sampling depth can be at most 12,000.
N samplings
This specifies how many times the random sampling should be repeated. If more than one random sampling is performed, the result is the average of all random samplings.
Transformation
Whether it is to correct for sequencing depth, to stabilize the variance or simply to improve the visualization of skewed data, a transformation step may be needed. Common methods include transforming counts into proportions (percent or counts per million) and applying a log. We are also working to propose more advanced transformations, including those developed for RNA sequencing (RNA-seq) as part of the DESeq2 (Love et al., 2014) and Voom (Law et al., 2014) pipelines, as well as methods that limit the effect of known experimental biases (batch effects), such as ComBat.
Transformation
Specifies which transformation method should be applied.
Analysis
The Analysis section of the analysis creation form sets which statistics to apply and which figures to generate.
Experimental design
This sets the default statistical method to apply for the analysis. Two options are available: Basic and Advanced.
Statistics (Advanced only)
Choose a statistical method to compare groups of samples. If Basic is chosen, this defaults to ANOVA.
Model
Choose a model defining groups of samples. If Basic is chosen, a model can be picked among the headers of the map. If Advanced is chosen, the model must be typed by the user in the form of an R formula. The formula must be compatible with the specified statistics. All terms of the formula must refer to column headers of the map.
Examples:
Simple design:
ID Subject Site
Spl1 subject1 Hand
Spl2 subject2 Hand
Spl3 subject3 Hand
Spl4 subject4 Foot
Spl5 subject5 Foot
Spl6 subject6 Foot
Statistics
Model
Comparing sites (parametric)
T-test
Site
Comparing sites (non-parametric)
Wilcoxon rank sum test
Site
Simple paired design:
The order of paired samples in the map is important!
ID Subject Site
Spl1 subject1 Hand
Spl2 subject2 Hand
Spl3 subject3 Hand
Spl4 subject1 Foot
Spl5 subject2 Foot
Spl6 subject3 Foot
Statistics
Model
Comparing sites (parametric)
Paired t-test
Site
Comparing sites (non-parametric)
Wilcoxon signed rank test
Site
Multiple comparisons:
ID Subject Site
Spl1 subject1 Hand
Spl2 subject2 Hand
Spl3 subject3 Hand
Spl4 subject4 Foot
Spl5 subject5 Foot
Spl6 subject6 Foot
Spl7 subject7 Mouth
Spl8 subject8 Mouth
Spl9 subject9 Mouth
Statistics
Model
Comparing sites (parametric)
ANOVA
Site
Comparing sites (non-parametric)
Kruskal-Wallis rank sum test
Site
Multiple comparisons with nesting:
ID Subject Site
Spl1 subject1 Hand
Spl2 subject2 Hand
Spl3 subject3 Hand
Spl4 subject1 Foot
Spl5 subject2 Foot
Spl6 subject3 Foot
Spl7 subject1 Mouth
Spl8 subject2 Mouth
Spl9 subject3 Mouth
Statistics
Model
Comparing sites
Friedman test
Site | Subject
Additive model:
ID Treatment Gender
Spl1 Treated M
Spl2 Treated M
Spl3 Treated M
Spl4 Treated F
Spl5 Treated F
Spl6 Treated F
Spl7 Control M
Spl8 Control M
Spl9 Control M
Spl10 Control F
Spl11 Control F
Spl12 Control F
Statistics
Model
Assessing effects of treatment and gender
ANOVA
Treatment+Gender
Proportions
For each sample, display the proportions of each observation (as percent) in the form of a stacked barchart. This gives an overview of the primary dataset. This analysis is performed before applying any selected transformation.
Diversity
Perform a diversity analysis of the primary dataset. This will display rarefaction curves for the selected diversity metric(s). Rarefaction curves are used to estimate the diversity as a function of the sampling depth. A rarefaction curve showing an asymptotic behavior is considered an indicator of a sampling depth sufficient to observe the sample diversity. This analysis is performed before applying any selected transformation.
Metric
Choose a diversity metric. Choices include the richness as well as metrics included in the R vegan, ineq and fossil packages. The richness simply represents the number of distinct observations seen within a particular sample.
Compare groups
If selected, diversity between groups will be assessed using the statistics and model specified in the experimental design.
perMANOVA
Perform a permutational multivariate analysis of variance using the Adonis method from the R package vegan. This method uses a distance matrix computed from the primary dataset.
Distance metric
Choose a distance metric. Choices include metrics available in the R vegan package as well as distances based on correlation coefficients.
Model
For compatibility purposes, this model may differ from the model specified in the experimental design section. This model must be in the form of an R formula compatible with the Adonis function of the R vegan package. All terms of the formula must refer to column headers of the map.
Strata
This is used to specify any nesting in the model. The strata must be compatible with the Adonis function of the R vegan package.
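Example (mirroring the nested Friedman design above): a Model of Site with Strata set to Subject constrains the permutations within each subject.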
PCA
Perform a principal component analysis (PCA) on the primary dataset. PCA is a commonly used dimensionality reduction method that projects a set of observations onto a smaller set of components that capture most of the variance between samples.
PCoA
Perform a principal coordinate analysis (PCoA) on the primary dataset. The PCoA is a form of dimensionality reduction based on a distance matrix.
Distance metric
Choose a distance metric. Choices include metrics available in the R vegan package as well as distances based on correlation coefficients.
Heatmap
This displays the primary dataset on a heatmap of proportions. The columns (samples) and rows (observations) of the heatmap are re-ordered using hierarchical clustering. Each observation will be individually compared between groups of samples based on the specified experimental design, and the associated p-values will be displayed on the side of the heatmap. Individual correlations between observations from the primary dataset and observations from the secondary dataset will also be displayed on the side of the heatmap when available. Samples will be color-coded according to the experimental design.
Changes
This performs a differential analysis. Each observation will be individually compared between groups of samples based on the specified experimental design. Fold-changes between groups, as well as the associated p-values and mean abundances, will be displayed on MA plots and volcano plots.
Correlation network
This builds a network with nodes representing observations and edges representing correlations. The network will be based on the primary dataset and will include the secondary dataset if available. Observations belonging to the primary and the secondary dataset will be represented using different node shapes. Nodes will be colored to convey additional information.
Correlation method
Choose a correlation method. Choices include the Pearson correlation and the Spearman correlation.
Clustering
This applies a clustering algorithm to separate samples into categories. The categories will be added as a new column to the map. Categories will automatically be compared with ANOVA in relevant analyses.
Algorithm
Choose a clustering algorithm. Choices include the k-means algorithm, the k-medoids algorithm (also known as partitioning around medoids, or PAM) and versions of the k-medoids algorithm based on distance matrices.
Similarity network
This builds a network with nodes representing samples and edges representing a similarity score. If a secondary dataset is available, three networks will be built: one based on the primary dataset, one based on the secondary dataset and a third based on the fusion of these two networks. The similarity network fusion (SNF) is based on the R SNFtool package. A categorical clustering based on each network is applied.
Metric for primary dataset
Choose the similarity metric to apply to the primary dataset.
Metric for secondary dataset
Choose the similarity metric to apply to the secondary dataset.
Clustering algorithm
Choose the clustering algorithm used to assign samples to categories based on each similarity network.
Versions
Genocrunch data analysis tools are based on R libraries. The libraries used are mentioned in the analysis report, and their versions are available on the Versions page via the Infos menu of the topbar.
Analysis report
A detailed analysis report is generated for each analysis. It is available directly after creating/updating an analysis via the My Analysis button of the topbar or via the corresponding icon () in the analysis index (for signed-in users only). The analysis report includes detailed descriptions of the tools and methods used, possible warning and error logs, as well as intermediate files, final files and interactive figures as they become available. Information on the pre-processing and data transformation is available for the primary dataset and the optional secondary dataset in their respective sections. Figures and descriptions for each analysis are available in separate sections. If multiple binning levels were specified, figures for each level can be accessed via the associated Level button.
Exporting figures
Figures can be exported via the associated Export menu. All figures can be exported as a vector image in the SVG format. Some figures are also available in the PDF format, but PDFs are generated at the same time as the figure raw data and cannot be edited with the interactive visualization tools. Raw figure data are also available in the JSON format, the TXT format or both. Figure legends are currently only available in the HTML format.
Archive
Files containing initial, intermediate and final data, as well as the analysis log and the logs for standard output and standard error, are automatically included in a .<%= APP_CONFIG[:archive_format] %> archive. This archive can be downloaded via the button of the analysis report page or via the corresponding icon () in the analysis index (for signed-in users only). Although some figures may be automatically included in the archive in the form of a minimal PDF file, the archive generally does not contain figure images. These should be exported separately via the Export menu.
Bugs report
During computation (while the analysis is running), the standard error (stderr) stream is redirected into a bugs report that can be downloaded using the link located at the bottom-right of the details page in the analysis report.
Usage limits
Time limits
Analyses are by default kept for <%= APP_CONFIG[:max_sandbox_job_age] %> days following their last update. However, analyses owned by registered users are kept for <%= APP_CONFIG[:max_job_age] %> days following their last update. Old analyses are automatically deleted from the server once they reach these limits.
Storage limits
The storage quota is set to <%= number_to_human_size(APP_CONFIG[:max_storage].to_i, precision: 2, prefix: :si) %> per user.
Genocrunch is a web-based data analysis platform dedicated to metagenomics and metataxonomics. It is tailored to process count datasets derived from high-throughput nucleic acid sequencing of microbial communities <%= image_tag('genocrunch_pipeline-1.png', height:'14px') %>, such as gene counts or counts of operational taxonomic units (OTUs) <%= image_tag('genocrunch_pipeline-2.png', height:'14px') %>. In addition to such a primary dataset, it also allows the integration of a secondary dataset (e.g. metabolite levels) <%= image_tag('genocrunch_pipeline-3.png', height:'14px') %>.
Genocrunch provides tools covering data pre-processing and transformation, diversity analysis, multivariate statistics, dimensionality reduction, differential analysis, clustering as well as similarity network analysis and associative analysis.
The results of clustering are automatically inferred as additional models into other analyses <%= image_tag('genocrunch_pipeline-4.png', height:'14px') %>.
Interactive visualization is offered for all figures.
diff --git a/lib/genocrunch_console/lib/genocrunchlib.R b/lib/genocrunch_console/lib/genocrunchlib.R
index b78eeba..2261033 100755
--- a/lib/genocrunch_console/lib/genocrunchlib.R
+++ b/lib/genocrunch_console/lib/genocrunchlib.R
@@ -1,2906 +1,2911 @@
#genocrunchlib.R
################
# Print message in a formatted way so it can be decomposed into categories of
# logs. Can also be used to issue errors
# Args:
# (1) text (character) Message to print
# (2) verbose (logical) TRUE: print 'text'; FALSE: don't print
# (3) stop (logical) TRUE: additionally raise an error with 'text'
# (4) log (character) Optional path to a log file; when set, output is
# redirected there via sink()
# Prints to stdout (or appends to 'log'):
# 'text'
################
log_fp <- NULL
PrintMsg <- function(text='default message', verbose=TRUE, stop=FALSE, log=log_fp) {
if (!is.null(log)) {
f <- file(log, open="at")
sink(file=f, append=TRUE)
sink(type="message")
}
if (as.logical(verbose) == TRUE) {
cat(text)
}
if (!is.null(log)) {
sink()
sink(type="message")
close(f)
}
if (as.logical(stop) == TRUE) {
stop(text)
}
}
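# Usage sketch: PrintMsg('"warning":"low sampling depth"', verbose=TRUE)
# echoes the message (redirected to the file in log_fp when it is set);
# stop=TRUE additionally raises an R error carrying the same text.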
################
# Write a table to an output location, with the option of having indented
# columns name if 'name'=''.
# Args:
# (1) table Table object
# (2) output.fp Path to the output file
# (3) name Name of first column
# (4) sep Separator character
# (5) na na
# (6) dec dec
# (7) quote quote
# Outputs:
# Writes 'table' to 'output.fp', starting first line with an indentation ('\t')
################
WriteTable <- function(table=NULL, output.fp='', name='name', sep='\t', na='',
dec='.', quote=FALSE) {
write.table(t(c(name, names(table))), file=output.fp, append=FALSE,
row.names=FALSE, col.names=FALSE, sep=sep, na=na, dec=dec,
quote=quote)
write.table(table, file=output.fp, append=TRUE, row.names=TRUE,
col.names=FALSE, sep=sep, na=na, dec=dec, quote=quote)
}
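# Usage sketch: WriteTable(counts, 'counts.txt', name='') writes the header
# row with an empty leading cell, i.e. the first line starts with a tab so
# that column names line up above their columns.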
################
# Reads the "Rfunctions" section of a json file as a json string
# Args:
# (1) json Path to the json file
# Outputs:
# A json string as found in the "Rfunctions" section of the json file
################
ReadJson <- function(file='/../etc/genocrunchlib.json', rel=TRUE) {
suppressMessages(library('rjson'))
if (rel) {
# Consider file as a relative path with respect to this library
args <- commandArgs(trailingOnly=FALSE)
dir <- dirname(sub('--file=', '', args[grep('--file=', args)]))
file <- normalizePath(paste(dir,
file,
sep=''),
mustWork=TRUE)
}
if (!file.exists(file)) {
PrintMsg(paste('"error":"File not found: ',
file,
'."',
sep=''),
verbose=TRUE, stop=TRUE)
}
json_str <- fromJSON(file=file)
return (toJSON(json_str$choices))
}
################################################################################
################################################################################
# MANIPULATION #################################################################
# MANIPULATION #################################################################
# MANIPULATION #################################################################
# MANIPULATION #################################################################
################################################################################
################################################################################
################
# Convert a data frame of factors to a numeric data frame. Unlike data.matrix,
# it will force characters to numeric.
# Args:
# (1) x A data frame
# Returns:
# A numeric version of 'x'
################
ForceNumericDataFrame <- function(x) {
x <- as.data.frame(x)
for (i in 1:ncol(x)) {
# as.character() first: factor columns would otherwise yield their
# integer codes instead of their labels
x[, i] <- as.numeric(as.character(unlist(x[, i])))
}
return (x)
}
################
# Convert values of a numeric vector to percents of its sum
# Args:
# (1) x A numeric vector
# Returns:
# A percent version of 'x'
################
Numeric2Percent <- function(x){
return (100*x/sum(x))
}
################
# Scale a numeric vector on a range of values
# Args:
# (1) x A numeric vector
# (2) range A numeric vector containing 2 elements
# (3) force A logical indicating whether infinite values should be forced into
# 'range'
# Returns:
# A scaled version of 'x', within 'range'
################
RangeScale <- function(x=NULL, range=c(0, 1), force=TRUE) {
norm <- (x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))
scaled <- range[1]*(1-norm)+range[2]*norm
if (force == TRUE) {
scaled[scaled == -Inf] <- min(range)
scaled[scaled == Inf] <- max(range)
}
return (scaled)
}
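# Usage sketch: RangeScale(c(1, 5, 10), range=c(0, 1)) returns
# 0.0000000 0.4444444 1.0000000.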
################
# Scale by standard deviation and center on mean
# Args:
# (1) x A numeric vector
# Returns:
# A scaled version of 'x'
################
cScale <- function(x=NULL) {
sd <- sd(x, na.rm=TRUE)
mean <- mean(x, na.rm = TRUE)
if (is.na(sd)) {
sd <- 1
} else if (sd == 0) {
sd <- 1
}
if (is.na(mean)) {
mean <- 0
}
return ((x-mean)/sd)
}
################################################################################
################################################################################
# JSON HANDLING ################################################################
# JSON HANDLING ################################################################
# JSON HANDLING ################################################################
# JSON HANDLING ################################################################
################################################################################
################################################################################
################
# Build a json string to store figure data
# Args:
# (1) data (data-frame)
# (2) xmetadata (data-frame) With nrow=ncol(data)
# Returns:
# A json string
################
Dataframe2DataJson <- function(data=NULL, xmetadata=NULL){
# Rules:
# colname(data) -> [{'name'}]
# rowname(data) and data -> [{'data':['rowname(data)':'data']}]
# xmetadata -> [{'rowname(xmetadata)'}]
json <- c(1:ncol(data))
for (i in 1:ncol(data)) {
xmetadata.json <- NULL
if (!is.null(xmetadata)) {
xmetadata.json <- c(1:ncol(xmetadata))
for (j in 1:ncol(xmetadata)) {
xmetadata.json[j] <- paste('"',
names(xmetadata)[j],
'":"',
xmetadata[i, j],
'"',
sep='')
}
xmetadata.json <- paste(xmetadata.json, collapse=',')
}
data.json.el <- c(1:nrow(data))
for (j in 1:nrow(data)) {
data.json.el[j] <- paste('"',
row.names(data)[j],
'":"',
data[j, i],
'"',
sep='')
}
data.json <- paste('{', paste(data.json.el, collapse=','), '}',sep='')
json[i] <- paste('{',
paste(c(paste('"name":"',
names(data)[i],
'"',
sep=''),
xmetadata.json,
paste('"data":',
data.json,
sep='')),
collapse=','),
'}',
sep='')
}
return (paste('[', paste(json, collapse=','), ']', sep=''))
}
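# Output sketch: for a 1x1 data frame d with column 's1', row 'otu1' and
# value 3, Dataframe2DataJson(d) returns [{"name":"s1","data":{"otu1":"3"}}].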
################
# Build a json string to store diversity values
# Args:
# (1) diversity (list)
# (2) p.value (list)
# (3) map (matrix)
# Returns:
# A json string
################
diversity2json <- function(diversity=NULL, p.value=NULL, map=NULL){
rar <- row.names(diversity)
nsample <- ncol(diversity)
sample <- c(1:nsample)
for (j in 1:nsample) {
ndata <- nrow(diversity)
data <- c(1:ndata)
for (k in 1:ndata) {
data[k] <- paste('"',
rar[k],
'":',
diversity[k, j],
sep='')
}
metadata <- c(1:ncol(map))
for (k in 1:ncol(map)) {
metadata[k] <- paste('"',
names(map[k]),
'":"',
map[j, k],
'"',
sep='')
}
sample[j] <- paste('{"name":"',
row.names(map)[j],
'",',
paste(metadata, collapse=','),
',"data":{',
paste(data, collapse=','),
'}}',
sep='')
}
if (!is.null(p.value)) {
stat <- c(1:length(p.value))
for (j in 1:length(p.value)) {
s <- c(1:length(p.value[[j]]))
for (k in 1:length(p.value[[j]])) {
s[k] <- paste('{"name":"',
names(p.value[[j]])[k],
'", "p-value":"',
p.value[[j]][1, k],
'"}',
sep='')
}
stat[j] <- paste(s, collapse=',')
}
json <- paste('{"data":[',
paste(sample, collapse=','),
'],"stat":[',
paste(stat, collapse=','),
']}',
sep='')
} else {
json <- paste('{"data":[',
paste(sample, collapse=','),
']}',
sep='')
}
return (json)
}
################
# Build a json tree from dendrogram
# Args:
# (1) dendrogram (list) A dendrogram (nested lists)
# Returns:
# A json string
################
dendrogram2json <- function(dendrogram=NULL, json=NULL, it=0) {
start <- FALSE
if (is.null(json)) {
start <- TRUE
json <- paste('{"name":"node',
it,
'", "children":[',
sep='')
}
json.vect <- c(1:length(dendrogram))
for (i in 1:length(dendrogram)) {
if (length(dendrogram[[i]]) > 1) {
it <- it+1
json.vect[i] <- paste('{"name":"node',
it,
'", "children":[',
sep='')
dendrogram2json.out <- dendrogram2json(dendrogram[[i]], json.vect[i], it)
json.vect[i] <- dendrogram2json.out[[1]]
it <- dendrogram2json.out[[2]]
json.vect[i] <- paste(json.vect[i], ']}', sep='')
} else {
json.vect[i] <- paste('{"name":"',
dendrogram[[i]],
'"}',
sep='')
}
}
if (start) {
json <- paste(json, paste(json.vect, collapse=','), ']}', sep='')
} else {
json <- paste(json, paste(json.vect, collapse=','), sep='')
}
out <- list()
out[1] <- json
out[2] <- it
return (out)
}
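################
# Usage sketch for dendrogram2json() (illustration only): leaves are
# length-1 elements, internal nodes are nested lists.
################
if (FALSE) {
  dendrogram2json(list(list('a', 'b'), 'c'))[[1]]
  # Returns (wrapped here for readability):
  # {"name":"node0", "children":[{"name":"node1",
  #  "children":[{"name":"a"},{"name":"b"}]},{"name":"c"}]}
}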
################
# Build a json network from similarity matrix
# Args:
#     (1) mat (list of numeric matrices) Similarity matrices
#     (2) map (data-frame) Optional per-node metadata
# Returns:
# A json string
################
buildFusionNetworkJson <- function(mat=NULL, map=NULL){
names <- colnames(mat[[1]])
ncol <- ncol(mat[[1]])
if (!is.null(map)) {
ncol.map <- ncol(map)
}
nodes <- c(1:ncol)
for (i in 1:ncol){
if (!is.null(map)) {
map.row <- c(1:ncol.map)
for (j in 1:ncol.map) {
map.row[j] <- paste('"',
names(map)[j],
'":"',
map[i, j],
'"',
sep='')
}
} else {
map.row <- '"none":"none"'
}
nodes[i] <- paste('{"id":',
i-1,
',"name":"',
names[i],
'",',
paste(map.row, collapse=','),
'}',
sep='')
}
  links <- c(1:(ncol*(ncol-1)/2))
k <- 1
for (i in 1:(ncol-1)) {
for (j in (i+1):ncol) {
weights <- c(1:length(mat))
for (l in 1:length(mat)) {
weights[l] <- mat[[l]][i,j]
}
links[k] <- paste('{"source":',
i-1,
',"target":',
j-1,
',"weight":[',
paste(weights,collapse = ","),
']}',
sep='')
k <- k+1
}
}
return(paste('{"nodes":[', paste(nodes, collapse = ","),'],"links":[', paste(links, collapse = ","), ']}', sep=''))
}
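################
# Usage sketch for buildFusionNetworkJson() (illustration only): one node per
# column and one link per pair of columns; 'weight' holds one value per
# input matrix.
################
if (FALSE) {
  m <- matrix(c(1, 0.5, 0.5, 1), ncol=2,
              dimnames=list(c('s1', 's2'), c('s1', 's2')))
  buildFusionNetworkJson(mat=list(m))
  # {"nodes":[{"id":0,"name":"s1","none":"none"},
  #           {"id":1,"name":"s2","none":"none"}],
  #  "links":[{"source":0,"target":1,"weight":[0.5]}]}
}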
################################################################################
################################################################################
# MODIFICATION #################################################################
# MODIFICATION #################################################################
# MODIFICATION #################################################################
# MODIFICATION #################################################################
################################################################################
################################################################################
################
# Sort the columns of a data-frame according to the row names of a map
# Args:
#     (1) table (matrix-like object) Table to be sorted
#     (2) map (matrix-like object) Map whose row names define the column order
#     (3) verbose (bool) Print messages? see PrintMsg() options
# Returns:
# A sorted version of table
# Print to stdout:
# A description
################
SortTable <- function(table=NULL, map=NULL, verbose=TRUE) {
PrintMsg('"description":"The table columns were sorted according to the map."', verbose)
return(as.data.frame(table[, as.vector(row.names(map))]))
}
################
# Transpose a data frame (or a matrix-like object)
# Args:
# (1) x (matrix-like object)
# Returns:
# A transposed version of 'x' (numeric dataframe)
################
Transpose <- function(x=NULL) {
if (is.data.frame(x)) {
names <- names(x)
row.names <- row.names(x)
x <- as.data.frame(t(x))
names(x) <- row.names
row.names(x) <- names
} else {
x <- t(x)
}
return (x)
}
################
# Filter numeric table (with category-specific options for OTU-type tables)
# Args:
# (1) table (numeric dataframe) Count table
#     (2) category (character) Name of the (optional) category column
#     (3) abundance_threshold (numeric) Min abundance reached in at least
#         one column (in % if abundance_threshold_type='percent')
#     (4) abundance_threshold_type (character) 'percent' or raw-count units
#     (5) presence_threshold (numeric) Min presence (number of non-zero
#         values) per row (in % if presence_threshold_type='percent')
#     (6) presence_threshold_type (character) 'percent' or 'int'
#     (7) verbose (character) Print messages? see PrintMsg() options
# Returns:
#     A filtered version of 'table' (numeric dataframe)
# Prints to stdout:
# A description of the function
################
FilterTable <- function(table=NULL, category='taxonomy',
abundance_threshold=0.03,
abundance_threshold_type='percent',
presence_threshold=1,
presence_threshold_type='int',
verbose=TRUE) {
nrow.init <- nrow(table)
# First filter based on minimal abundance
msg <- paste('"description":"Removed rows not exceeding ', abundance_threshold, sep='')
filter <- table[, names(table) != category]
minmax.a <- paste(min(filter), max(filter), sep='-')
if (abundance_threshold_type == 'percent') {
filter <- as.data.frame(apply(filter, 2, Numeric2Percent))
msg <- paste(msg, '%', sep='')
minmax.a <- paste(min(filter), '-', max(filter), '%', sep='')
}
max.abundance <- apply(filter, 1, max)
table.filtered <- table[max.abundance >= abundance_threshold, ]
nrow.filtered.a <- nrow(table.filtered)
msg <- paste(msg,
' in at least one column: ',
nrow.init-nrow.filtered.a,
' rows were removed out of ',
nrow.init,
'. Then, removed rows with presence (number of values not equal to zero) lower than ',
presence_threshold,
sep='')
# Then filter based on minimal presence
filter <- table.filtered[, names(table.filtered) != category]
presence <- rowSums(filter != 0)
minmax.p <- paste(min(presence), max(presence), sep='-')
if (presence_threshold_type == 'percent') {
presence <- Numeric2Percent(presence)
msg <- paste(msg, '%', sep='')
minmax.p <- paste(min(presence), '-', max(presence), '%', sep='')
}
table.filtered <- table.filtered[presence >= presence_threshold, ]
nrow.filtered.p <- nrow(table.filtered)
msg <- paste(msg,
': ',
nrow.filtered.a-nrow.filtered.p,
' rows were removed out of ',
nrow.filtered.a,
' (final number of rows = ',
nrow.filtered.p,
'). Min-max abundance found was ',
minmax.a,
'. Min-max presence found was ',
minmax.p,
'."',
sep='')
PrintMsg(msg, verbose)
if (nrow.filtered.p == 0) {
PrintMsg(paste('"error":"All the rows were filtered out. Please chose an abundance threshold within ', minmax.a, ' and a presence threshold within ', minmax.p, '."', sep=''), verbose, TRUE)
}
return (table.filtered)
}
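################
# Usage sketch for FilterTable() (illustration only; relies on PrintMsg() and
# Numeric2Percent() defined earlier in this file):
################
if (FALSE) {
  tab <- data.frame(s1=c(90, 10, 0), s2=c(50, 0, 50),
                    taxonomy=c('A', 'B', 'C'),
                    row.names=c('o1', 'o2', 'o3'))
  # Keep rows reaching 20% in at least one sample and present (non-zero) in
  # at least 2 samples: only 'o1' survives ('o2' fails the abundance filter,
  # 'o3' the presence filter).
  FilterTable(tab, abundance_threshold=20, presence_threshold=2)
}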
################
# Bin numeric table by category
# Args:
# (1) table (numeric dataframe) Count table
#     (2) category_col (character) Name of the column in 'table' to consider
#         for data aggregation
#     (3) fun (character) Function to use for data aggregation. Valid
#         choices: 'mean', 'sum'
#     (4) level (numeric) Number of ';'-separated category levels to keep
#     (5) verbose (character) Print messages? see PrintMsg() options
#     (6) vstyle (numeric) Verbosity style (1 or 2)
# Returns:
# An aggregated version of 'table' (numeric dataframe)
# Prints to stdout:
# A description of the function
################
BinTableByCategory <- function(table=NULL, category_col='taxonomy', fun='sum',
level='NA', verbose=TRUE, vstyle=1) {
if ((category_col == '') || !(category_col %in% names(table)) ) {
category_col <- "row names"
category <- row.names(table)
} else {
category <- paste(table[, names(table) == category_col],
'; (',
row.names(table),
')',
sep='')
table <- table[, names(table) != category_col]
}
category <- strsplit(category, ';')
if (vstyle == 1) {
PrintMsg(paste('"description":"Observations were binned by categories (category column:',
category_col,
', function:',
fun,
', level:',
level,
')."',
sep=''), verbose)
} else if (vstyle == 2) {
PrintMsg(paste('"description":"Observations were binned by categories (category column:',
category_col,
', function:',
fun,
'):"',
sep=''), verbose)
}
if (vstyle > 1) {
PrintMsg(paste('"description-item":"Category level ',
level,
'"',
sep=''), verbose)
}
for (i in 1:length(category)) {
category[[i]] <- gsub('^ *.__ *$', '', category[[i]])
category[[i]] <- gsub('^ *', '', category[[i]])
category[[i]] <- gsub(' *$', '', category[[i]])
category[[i]] <- category[[i]][! category[[i]] %in% c('', ' ')]
if (length(category[[i]]) >= level) {
category[[i]] <- paste(category[[i]][1:level], collapse='; ')
} else {
category[[i]] <- paste(category[[i]], collapse='; ')
}
}
category <- list(unlist(category))
name <- 'category'
while (name %in% names(table)) {
name <- paste(name,paste(sample(c(0:9, letters, LETTERS), 3), collapse=''),sep='_')
}
names(category) <- name
table.binned <- aggregate(table,
by=category, FUN=fun)
row.names(table.binned) <- table.binned[, name]
table.binned <- table.binned[, names(table.binned) != name]
return(table.binned)
}
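################
# Usage sketch for BinTableByCategory() (illustration only): rows sharing the
# same taxonomy prefix at the requested level are pooled with 'fun'.
################
if (FALSE) {
  tab <- data.frame(s1=c(1, 2, 3), s2=c(4, 5, 6),
                    taxonomy=c('k__Bacteria; p__Firmicutes',
                               'k__Bacteria; p__Bacteroidetes',
                               'k__Bacteria; p__Firmicutes'),
                    row.names=c('o1', 'o2', 'o3'))
  # Sum counts at level 2 (phylum): 'o1' and 'o3' are pooled.
  BinTableByCategory(tab, category_col='taxonomy', fun='sum', level=2)
}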
################
# Divide a data frame (or a matrix-like object) by a vector
# Args:
# (1) table (numeric dataframe) Count table
# (2) vect (vector) Vector to divide 'table'
# (3) verbose (character) Print messages? see PrintMsg() options
# Returns:
# A divided (per row) version of 'table' (numeric dataframe)
# Prints to stdout:
# A description of the function
################
DivTable <- function(table=NULL, vect=NULL, verbose=TRUE) {
  PrintMsg('"description":"The table rows were divided by a vector."', verbose)
return (as.matrix(table) / unlist(vect))
}
################
# Transform to log2 count per million
# Args:
# (1) table (numeric dataframe) Count table
# (2) verbose (character) Print messages? see PrintMsg() options
# Returns:
# A transformed version of 'table' (numeric dataframe)
# Prints to stdout:
# A description of the function
################
ApplyLog2Cpm <- function(table=NULL, verbose='TRUE') {
if (length(table[table < 0]) > 0) {
PrintMsg('"error":"The log2cpm transformation was not applied because negative values were detected."',
verbose, TRUE)
return ()
}
if (length(table[table%%1 > 0]) > 0) {
PrintMsg('"warning":"Non-integer values were detected."', verbose)
table <- round(table, digits=0)
}
PrintMsg('"description":"Values were converted into per-million per column and a log-transformation was applied (log2(x+1))."',
verbose)
transformed.table <- as.data.frame(log2(1+(apply(as.matrix(table),
2,
Numeric2Percent))*10000))
names(transformed.table) <- names(table)
row.names(transformed.table) <- row.names(table)
return (transformed.table)
}
################
# Transform to log2
# Args:
# (1) table (numeric dataframe) Count table
# (2) verbose (character) Print messages? see PrintMsg() options
# Returns:
# A transformed version of 'table' (numeric dataframe)
# Prints to stdout:
# A description of the function
################
ApplyLog2 <- function(table=NULL, verbose=TRUE) {
if (length(table[table < 0]) > 0) {
PrintMsg('"error":"log2(x+1) was not applied because negative values were detected."',
verbose, TRUE)
return ()
}
PrintMsg('"description":"A log-transformation was applied (log2(x+1))."',
verbose)
transformed.table <- as.data.frame(log2(1+(as.matrix(table))))
names(transformed.table) <- names(table)
row.names(transformed.table) <- row.names(table)
return (transformed.table)
}
################
# Transform to percent (per column)
# Args:
# (1) table (numeric dataframe) Count table
# (2) verbose (character) Print messages? see PrintMsg() options
# Returns:
# A transformed version of 'table' (numeric dataframe)
# Prints to stdout:
# A description of the function
################
ApplyCount2Percent <- function(table, verbose=TRUE) {
PrintMsg('"description":"Values were converted into percentages per column."', verbose=verbose)
transformed.table <- as.data.frame(apply(as.matrix(table), 2, Numeric2Percent))
names(transformed.table) <- names(table)
row.names(transformed.table) <- row.names(table)
return (transformed.table)
}
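################
# Usage sketch for the transformations above (illustration only; assumes
# Numeric2Percent(), defined earlier in this file, converts a vector to
# percentages of its sum):
################
if (FALSE) {
  tab <- data.frame(s1=c(2, 8), s2=c(1, 3), row.names=c('o1', 'o2'))
  ApplyCount2Percent(tab)  # s1 -> 20, 80 ; s2 -> 25, 75
  ApplyLog2(tab)           # element-wise log2(x+1)
}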
################
# Apply rarefaction
# Args:
# (1) table (numeric dataframe) Count table
# (2) sample (positive integer or 'max') Sampling depth
#     (3) nsampling (positive integer) Number of repeated samplings to perform
# (4) verbose (character) Print messages? see PrintMsg() options
# Returns:
# A rarefied version of 'table' (a numeric dataframe)
# Prints to stdout:
# A description of the function
################
RarefyTable <- function(table=NULL, sample='max', nsampling=1, verbose=TRUE) {
suppressMessages(library('vegan'))
# Round counts if not integers
if (length(table[table < 0]) > 0) {
PrintMsg('"error":"Rarefaction is not applicable because negative values were detected."', verbose, TRUE)
return ()
}
if (length(table[table%%1 > 0]) > 0) {
PrintMsg('"warning":"Non-integer values were detected. In order to permit rarefaction, these values were rounded to the nearest integer."', verbose)
table <- round(table, digits=0)
}
# Define/check rarefaction depth
min.sample <- min(colSums(table))
if (sample == 'max' || as.numeric(sample) > min.sample) {
sample <- min.sample
}
if (sample < 100) {
PrintMsg(paste('"warning":"Very low counts detected in the following sample(s): ',
paste(names(table)[which(colSums(table) < 100)], collapse=','),
'."',
sep=''), verbose)
} else if (sample < 1000) {
PrintMsg(paste('"warning":"Low counts detected in the following sample(s): ',
paste(names(table)[which(colSums(table) < 1000)], collapse=','),
'."',
sep=''), verbose)
}
# Apply rarefaction
table <- t(table)
table.list <- list()
for (i in 1:as.numeric(nsampling)) {
table.list[[i]] <- rrarefy(table, sample=sample)
}
rarefied.table <- apply(simplify2array(table.list), c(1, 2), mean)
# element-wise mean across rarefied tables
PrintMsg(paste('"description":"The number of observations per samples were equalized using the rarefaction method with a sampling depth of ',
sample,
' (random sampling without replacement) (R package vegan). The rarefied values were calculated as the means of ',
nsampling,
' independent samplings that were rounded to the nearest integer."',
sep=''), verbose=verbose)
return (data.frame(t(round(rarefied.table, digits=0))))
}
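################
# Usage sketch for RarefyTable() (illustration only): both samples are
# subsampled down to min(colSums(tab)) counts; averaging over several
# samplings smooths the random draws.
################
if (FALSE) {
  set.seed(42)
  tab <- data.frame(s1=c(500, 300, 200), s2=c(10, 60, 30),
                    row.names=c('o1', 'o2', 'o3'))
  RarefyTable(tab, sample='max', nsampling=10)  # both columns then sum to ~100
}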
################
# Suppress batch effects from a count table
# Args:
# (1) table (numeric dataframe) Count table
# (2) map (dataframe) Design
# (3) effect (character) comma-delimited list of random effects to
# suppress. Elements must be names of columns in mapping file
#     (4) fun (character) Method: 'combat' or 'mean_centering'
#     (5) verbose (character) Print messages? see PrintMsg() options
# Returns:
#     An adjusted version of 'table' (a numeric dataframe)
# Prints to stdout:
# A description of the function
################
SuppressBatchEffect <- function(table=NULL, map=NULL, effect=NULL,
fun='combat', verbose=TRUE) {
method <- list(combat='ComBat', mean_centering='mean centering')
PrintMsg(paste('"description":"Batch effect suppression on ',
effect,
' using',
method[[fun]],
'"'),
verbose)
effect <- unlist(strsplit(effect, split=','))
if (fun == 'mean_centering') {
table <- t(table)
for (i in 1:length(effect)) {
fact <- map[effect[i]]
fact.name <- as.vector(unique(unlist(fact)))
nfact <- length(fact.name)
for (j in 1:ncol(table)) {
data <- data.frame(d=table[, j], fact=fact)
means <- NULL
for (k in 1:nfact) {
means <- append(means, mean(data[fact == fact.name[k], 'd']))
}
for (k in 1:nfact) {
adj <- max(means)-means[k]
data[fact == fact.name[k], 'd'] <- data[fact == fact.name[k], 'd']+adj
}
table[, j] <- data$d
}
}
table <- t(table)
} else if (fun == 'combat') {
suppressMessages(library('sva'))
table <- ComBat(table, map[, effect])
}
return (table)
}
################
# Filter mapping file to keep only terms in model
# Args:
# (1) map (dataframe)
# (2) model (formula vector)
# Returns:
# A filtered map
################
FilterMap <- function(map=NULL, model=NULL) {
# keep only columns of map that are used in the model
if (!is.null(model)) {
term <- list()
for (i in 1:length(model)) {
term[[i]] <-as.vector(unique(unlist(strsplit(model[i], split=c('[^A-Za-z0-9_]')))))
term[[i]] <- term[[i]][! term[[i]] %in% c("", " ", "1", "Error")]
}
term <- as.vector(unlist(term))
row.names <- row.names(map)
if (length(term) > 1) {
map <- as.data.frame(map[, names(map) %in% term])
} else {
map <- as.data.frame(map[, names(map) == term])
names(map) <- term
}
if (ncol(map) < length(unique(term))) {
PrintMsg(paste('"error":"Some terms in model ',
model,
' were not found in the map. Please make sure that all model terms refer to columns headers in the map."',
sep=''),
verbose=TRUE, TRUE)
}
}
return (map)
}
################
# Remove constant rows from table
# Args:
# (1) table (dataframe)
# Returns:
# A filtered table
################
RemoveConstantRows <- function(table=NULL, verbose=TRUE) {
filter <- rep(TRUE, nrow(table))
for (i in 1:nrow(table)) {
if (length(as.vector(unique(unlist(table[i, ])))) < 2) {
filter[i] <- FALSE
PrintMsg(paste('"warning":"Row ', i, ' (', row.names(table)[i], ') contained essentially constant data and was removed."', sep=''),
verbose=verbose)
}
}
return (table[filter == TRUE, ])
}
################################################################################
################################################################################
# ANALYSIS #####################################################################
# ANALYSIS #####################################################################
# ANALYSIS #####################################################################
# ANALYSIS #####################################################################
################################################################################
################################################################################
################
# Apply a clustering algorithm
# Args:
# (1) table (numeric dataframe) Count table
#     (2) fun (character) Clustering function (algorithm)
# (3) json (character) Json string specifying packages
# (4) verbose (character) Print messages? see PrintMsg() options
# (5) graphical (logical) Generate graphics?
# Returns:
# A table of clusters
# Output to device:
# Depending on 'fun', a figure
# Prints to stdout:
# A description of the function
################
PerformClustering <- function(table=NULL, fun=c('kmeans', 'pam', 'pam-bray', 'pam-jaccard'), json=libJson,
verbose=TRUE, graphical=TRUE) {
if (ncol(table) < 10 || nrow(table) < 3) {
PrintMsg('"warning":"Not enough data to perform clustering (min 10 column and 3 rows)."',
verbose)
return (NULL)
} else if (ncol(table) > 65536) {
PrintMsg('"warning":"Sorry, the dataset is too large for clustering algorithms (max 65536 columns)."',
verbose)
return (NULL)
}
spec <- as.data.frame(do.call('rbind', json$clustering))
spec <- spec[spec$value == fun, ]
package <- ''
pkg <- unlist(strsplit(as.character(unlist(spec$pkg)), split=','))
if (length(pkg) > 0) {
for (i in 1:length(pkg)) {
suppressMessages(library(pkg[i], character.only=TRUE))
}
package <- paste(' (R package {', unlist(spec$pkg), '})', sep='')
}
PrintMsg(paste('"description":"Clusters were determined using the ', spec$label,
package, '."', sep=''),
verbose)
table <- t(table)
if (fun == 'kmeans') {
if (graphical == TRUE) {
kmean.out <- kmeansruns(table, criterion='asw', plot=F)$cluster # figure margins too large issue...
} else {
kmean.out <- kmeansruns(table, criterion='asw')$cluster
}
cluster <- paste('kmean',
kmean.out,
sep ='')
} else if (fun == 'pam') {
pam.out <- pamk(table, usepam=TRUE)
if (graphical == TRUE) {
plot(pam.out$pamobject, which.plots=2)
}
cluster <- paste('pam',
pam.out$pamobject$cluster,
sep ='')
} else if (fun == 'pam-bray') {
dist.data <- ComputeDistance(table=table, fun='bray', json=json,
verbose=verbose)
pam.out <- pamk(dist.data, usepam=TRUE)
if (graphical == TRUE) {
plot(pam.out$pamobject, which.plots=2)
}
cluster <- paste('pam',
pam.out$pamobject$cluster,
sep ='')
} else if (fun == 'pam-jaccard') {
dist.data <- ComputeDistance(table=table, fun='jaccard', json=json,
verbose=verbose)
pam.out <- pamk(dist.data, usepam=TRUE)
if (graphical == TRUE) {
plot(pam.out$pamobject, which.plots=2)
}
cluster <- paste('pam',
pam.out$pamobject$cluster,
sep ='')
} else {
return(NULL)
}
cluster <- as.data.frame(cluster)
row.names(cluster) <- row.names(table)
names(cluster) <- 'Clustering'
PrintMsg(paste('"description":"Clusters: ',
paste(paste(row.names(cluster), unlist(cluster), sep=':'), collapse=', '),
'."', sep=''),
verbose)
return (cluster)
}
################
# Build a relative abundance stacked barplot
# Args:
# (1) table (numeric dataframe) Count table
#     (2) map (data-frame) Experimental design table with nrow=ncol(table)
#     (3) verbose (character) Print messages? see PrintMsg() options
#     (4) graphical (logical) Generate graphics?
# Returns:
# Figure data as a json string
# Output to device:
# A figure
# Prints to stdout:
# A description of the function
################
AnalyseProportions <- function(table=NULL,
map=NULL,
verbose=TRUE, graphical=TRUE) {
# Convert to relative abundances per column
PrintMsg('"description":"Data was converted to proportions (%) per sample."',
verbose)
if (min(table) < 0) {
PrintMsg('"warning":"Negative values were found! Negative values are not supported by this analysis."',
verbose)
table <- as.data.frame(abs(table))
}
table <- as.data.frame(apply(table, 2, Numeric2Percent))
# Order by decreasing values
table <- as.data.frame(table[order(rowMeans(table), decreasing=TRUE), ])
# Build stacked barchart
if (as.logical(graphical) == TRUE) {
col <- rainbow(nrow(table))
par(mar=c(4, 4, 2, 4), xpd=TRUE)
barplot(as.matrix(table),
ylim=c(0, max(colSums(table))),
space=0,
border=NA,
names.arg=names(table),
ylab='%',
col=col,
las=2,
bty="n",
cex.names=1.84906*ncol(table)^-0.2898)
# Build legend in a separate figure
plot.new()
legend("topleft", row.names(table), cex=0.8,
fill=col, horiz=FALSE)
}
return (Dataframe2DataJson(data=table, xmetadata=map))
}
################
# Calculate diversity per column
# Args:
# (1) table (numeric dataframe) Count table
# (2) fun (character) The diversity function to use
# (3) json (character) Json string specifying packages
# (4) verbose (character) Print messages? see PrintMsg() options
# Returns:
# A vector of length=ncol(table)
# Prints to stdout:
# A description of the function
################
ComputeDiversity <- function(table=NULL, fun='shannon', json=libJson,
verbose=TRUE) {
spec <- as.data.frame(do.call('rbind', json$diversity))
spec <- spec[spec$value == fun, ]
package <- ''
pkg <- unlist(strsplit(as.character(unlist(spec$pkg)), split=','))
if (length(pkg) > 0) {
for (i in 1:length(pkg)) {
suppressMessages(library(pkg[i], character.only=TRUE))
}
package <- paste(' (R package {', unlist(spec$pkg), '})', sep='')
} else {
pkg <- ''
}
PrintMsg(paste('"description":"The ',
spec$label,
package,
' was calculated for each sample."', sep=''), verbose)
diversity <- NULL
if (pkg == 'vegan') {
for (i in 1:ncol(table)) {
diversity <- append(diversity, diversity(table[, i], index=fun, MARGIN=0,
base=2), after=length(diversity))
}
} else if (pkg == 'ineq') {
for (i in 1:ncol(table)) {
diversity <- append(diversity, ineq(table[, i], parameter=NULL, type=fun,
na.rm=TRUE), after=length(diversity))
}
} else if (fun == 'richness') {
for (i in 1:ncol(table)) {
diversity <- append(diversity, sum(table[, i] != 0),
after=length(diversity))
}
} else if (fun == 'chao1') {
diversity <- chao1(table, taxa.row = TRUE)
} else {
PrintMsg(paste('"error":"Unknown argument to fun (2):', fun, '"', sep=' '),
verbose, TRUE)
}
return (diversity)
}
################
# Compute correlations
# Args:
# (1) table (numeric dataframe) Count table
#     (2) fun (character) The correlation method to use
#     (3) json (character) Json string specifying packages
#     (4) verbose (character) Print messages? see PrintMsg() options
# Returns:
#     A correlation matrix between the rows of 'table', or a list with
#     'estimate' and 'p.value' matrices when test=TRUE
# Prints to stdout:
# A description of the function
################
ComputeCorrelation <- function(table=NULL, fun='pearson', test=FALSE,
json=libJson, verbose=TRUE) {
spec <- as.data.frame(do.call('rbind', json$correlation))
spec <- spec[spec$value == fun, ]
package <- ''
pkg <- unlist(strsplit(as.character(unlist(spec$pkg)), split=','))
if (length(pkg) > 0) {
for (i in 1:length(pkg)) {
suppressMessages(library(pkg[i], character.only=TRUE))
}
package <- paste(' (R package {', unlist(spec$pkg), '})', sep='')
}
PrintMsg(paste('"description":"Calculated ', spec$label, package, '."', sep=''),
verbose=verbose)
if (as.logical(test) == TRUE) {
cor.output <- list()
cor.output[['estimate']] <- matrix(ncol=nrow(table), nrow=nrow(table))
cor.output[['p.value']] <- matrix(ncol=nrow(table), nrow=nrow(table))
for (i in 2:nrow(table)) {
for (j in 1:(i-1)) {
out <- cor.test(unlist(table[i, ]), unlist(table[j, ]), method=fun)
cor.output[['estimate']][i, j] <- out$estimate
cor.output[['p.value']][i, j] <- out$p.value
}
}
cor.output[['estimate']][upper.tri(cor.output[['estimate']])] <- t(cor.output[['estimate']])[upper.tri(cor.output[['estimate']])]
diag(cor.output[['estimate']]) <- 1
cor.output[['p.value']][upper.tri(cor.output[['p.value']])] <- t(cor.output[['p.value']])[upper.tri(cor.output[['p.value']])]
diag(cor.output[['p.value']]) <- 0
return (cor.output)
}
if (fun == 'pearson') {
return (cor(t(table), method=fun, use='pairwise.complete.obs'))
} else {
return (cor(t(table), method=fun, use='na.or.complete'))
}
}
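################
# Usage sketch for ComputeCorrelation() (illustration only; assumes
# 'spearman' is declared under $correlation in libJson, which is defined
# earlier in this file):
################
if (FALSE) {
  tab <- data.frame(s1=c(1, 2), s2=c(2, 4), s3=c(3, 5), s4=c(4, 9),
                    row.names=c('o1', 'o2'))
  ComputeCorrelation(tab, fun='spearman')             # 2x2 matrix of estimates
  ComputeCorrelation(tab, fun='spearman', test=TRUE)  # $estimate and $p.value
}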
################
# Compute distances
# Args:
# (1) table (numeric dataframe) Count table
# (2) fun (character) The distance function to use
# (3) json (character) Json string specifying packages
# (4) verbose (character) Print messages? see PrintMsg() options
# Returns:
#     A distance matrix (object of class 'dist')
# Prints to stdout:
# A description of the function
################
ComputeDistance <- function(table=NULL, fun='bray', json=libJson,
verbose=TRUE) {
spec <- as.data.frame(do.call('rbind', json$distance))
spec <- spec[spec$value == fun, ]
package <- ''
pkg <- unlist(strsplit(as.character(unlist(spec$pkg)), split=','))
if (length(pkg) > 0) {
for (i in 1:length(pkg)) {
suppressMessages(library(pkg[i], character.only=TRUE))
}
package <- paste(' (R package {', unlist(spec$pkg), '})', sep='')
}
if (fun %in% c('pearson', 'spearman')) {
PrintMsg(paste('"description":"A distance matrix was calculated based on ',
spec$label, package, '."', sep=''), verbose=verbose)
return (as.dist(as.matrix(ComputeCorrelation(table, fun=fun, json=json,
verbose=FALSE))))
} else {
    # Correct for negative entries by adding a scalar (Cailliez correction method)
if (min(table) < 0) {
table <- table+abs(min(table))
PrintMsg(paste('"description":"A Caillez correction was applied and a distance matrix was calculated based on ',
spec$label, package, '."', sep=''), verbose=verbose)
} else {
PrintMsg(paste('"description":"A distance matrix was calculated based on ',
spec$label, package, '."', sep=''), verbose=verbose)
}
if ('vegan' %in% pkg) {
return (vegdist(table, method=fun, binary=FALSE, diag=TRUE, upper=TRUE))
}
}
}
################
# Test if model is balanced
# Args:
# (1) map (data-frame) Experimental design table
# (2) model (character) A formula representing the model for stats. Terms
# must refer to names(map)
# (3) verbose (character) Print messages? see PrintMsg() options
# Returns:
# A bool indicating whether the design is balanced or not
################
ModelIsBalanced <- function(map=NULL, model=NULL, verbose=TRUE) {
# keep only columns of map that are used in the model
map <- FilterMap(map=map, model=model)
for (i in 1:ncol(map)) {
fact <- as.vector(unique(unlist(map[, i])))
n <- length(map[map[, i] == fact[1], i])
if (length(fact) > 1) {
for (j in 2:length(fact)) {
if (length(map[map[, i] == fact[j], i]) != n) {
          PrintMsg(paste('"description":"Model ', model, ' is not balanced."', sep=''), verbose=verbose)
return (FALSE)
}
}
}
}
  PrintMsg(paste('"description":"Model ', model, ' is balanced."', sep=''), verbose=verbose)
return (TRUE)
}
################
# Test if model's variables could be nested
# Args:
# (1) map (data-frame) Experimental design table
# (2) model (character) A formula representing the model for stats. Terms
# must refer to names(map)
# (3) verbose (character) Print messages? see PrintMsg() options
# Returns:
# A list of nested variables
################
ModelIsNested <- function(map=NULL, model=NULL, verbose=TRUE) {
# keep only columns of map that are used in the model
map <- FilterMap(map=map, model=model)
if (ncol(map) < 2) {
return (NULL)
}
out <- list()
out.txt <- NULL
for (i in 1:ncol(map)) {
fact <- as.vector(unique(unlist(map[, i])))
test <- as.data.frame(map[, -i])
names(test) <- names(map)[-i]
out[[names(map)[i]]] <- NULL
for (j in 1:ncol(test)) {
group1 <- as.vector(test[map[, i] == fact[1], j])
nested <- TRUE
if (length(as.vector(unique(map[, i]))) >= length(as.vector(unique(test[, j])))) {
nested <- FALSE
} else if (length(as.vector(unique(group1))) != length(group1)) {
nested <- FALSE
} else {
for (k in 2:length(fact)) {
if (!identical(group1, as.vector(test[map[, i] == fact[k], j]))) {
nested <- FALSE
break
}
}
}
if (nested == TRUE) {
out[[names(map)[i]]] <- append(out[[names(map)[i]]], names(test)[j], after=length(out[[names(map)[i]]]))
        out.txt <- append(out.txt, paste(names(map)[i], names(test)[j], sep='/'), after=length(out.txt))
}
}
}
if (length(out) == 0) {
return (NULL)
}
  PrintMsg(paste('"description":"The following nesting was detected in model ',
                 model,
                 ': ',
                 paste(out.txt, collapse=', '),
                 '. Please check that the model is the one you want."',
                 sep=''), verbose=verbose)
return (out)
}
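################
# Usage sketch for ModelIsNested() (illustration only): each subject appears
# exactly once per treatment level, so 'subject' is reported as nested in
# 'treatment' (a repeated-measures design).
################
if (FALSE) {
  map <- data.frame(treatment=rep(c('t1', 't2'), each=3),
                    subject=rep(c('a', 'b', 'c'), 2),
                    row.names=paste('s', 1:6, sep=''))
  ModelIsNested(map=map, model='treatment+subject')  # list(treatment='subject')
}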
################
# Compute statistics
# Args:
# (1) response (numeric vector)
# (2) map (data-frame) Experimental design table with
# nrow=length(response)
# (3) method (character) Statistics to use
# (4) model (character) A formula representing the model for stats. Terms
# must refer to names(map)
#     (5) pairwise (bool) Perform pairwise comparison?
# (6) verbose (character) Print messages? see PrintMsg() options
# Returns:
# A data-frame
# Prints to stdout:
# A description of the function
################
ComputeStats <- function(response=NULL, map=NULL, method='anova',
model=NULL, pairwise=TRUE, verbose=TRUE) {
if (is.null(method)) {
method <- 'anova'
PrintMsg('"warning":"The statistical method was not specified. An ANOVA will be performed by default."', verbose)
}
# If no model is specified, assume all columns in map are fixed variables
if (is.null(model)) {
model <- paste(names(map), collapse='+')
PrintMsg(paste('"warning":"No model was specified. Based on the map, the following model will be used by default: ',
model,
'."',
sep=''), verbose)
}
# Test that each factor has >= 2 levels
term <- as.vector(unique(unlist(strsplit(model, split=c('[^A-Za-z0-9_]')))))
term <- term[! term %in% c("", " ", "1", "Error")]
for (i in 1:length(term)) {
if (!term[i] %in% names(map)) {
PrintMsg(paste('"error":"The following model factor was not found in map: ',
term[i],
'. Please correct the model or correct the map."',
sep=''), verbose=TRUE, stop=TRUE)
}
levels <- as.vector(unique(unlist(map[, term[i]])))
if (length(levels) < 2) {
PrintMsg(paste('"error":"The following model factor has less than 2 levels: ',
term[i],
'. Please correct the model or correct the map."',
sep=''), verbose=TRUE, stop=TRUE)
}
for (j in 1:length(levels)) {
if (length(map[map[, term[i]] == levels[j], ]) == 1) {
PrintMsg(paste('"warning":"The following group contains only one element (variance is null): ',
levels[j],
'. This will impact on statistics."',
sep=''), TRUE)
}
}
}
response.name <- 'response'
while (response.name %in% names(map)) {
response.name <- paste('response', paste(sample(c(letters, LETTERS, c(0:9)), 4), collapse=''), sep='')
}
  formula <- paste(response.name, ' ~ ', model, sep='')
  # Build the dataframe; name the response column explicitly so the formula
  # matches even if a column called 'response' already exists in the map
  data <- cbind(map, response)
  names(data)[ncol(data)] <- response.name
pairwise.output <- NULL
if (method == 'anova' || is.null(method)) {
# PARAMETRIC, ACCEPTS MODELS WITH >2 FACTORS, MULTIPLE FIXED VARIABLES AND NESTING
# MODEL CAN BE OF THE FORM "a*b+c"
PrintMsg(paste('"description":"An analysis of variance (ANOVA) was performed using the model: ', model,
'."', sep=''), verbose)
aov.output <- aov(formula=as.formula(formula), data=data)
if (pairwise == TRUE) {
pairwise.output <- TukeyHSD(aov.output)
}
summary <- as.data.frame(summary(aov.output)[[1]])
if ('Pr(>F)' %in% names(summary)) {
p.value <- summary[, 'Pr(>F)']
} else {
PrintMsg(paste('"warning":"Impossible to calculate a p-value. Please make sure that the model ',
model, ' is not saturated."',
sep=''), verbose)
p.value <- rep('NA', nrow(summary))
}
summary <- data.frame(row.names=gsub('[[:space:]]', '', row.names(summary)),
p.value=p.value)
#explained=summary[, 'Sum Sq']/sum(summary[, 'Sum Sq']))
} else if (method == 'friedman') {
    # NON-PARAMETRIC, ACCEPTS MODELS WITH >2 FACTORS BUT REQUIRES EXACTLY 2 NESTED VARIABLES
# MODEL HAS TO BE OF THE FORM "a | b"
friedman.terms <- unlist(strsplit(model, split=' '))
if (length(friedman.terms) != 3 || friedman.terms[2] != '|') {
PrintMsg(paste('"error":"The model must be of the form a | b when using Friedman test. Please provide an appropriate model. The current model is: ',
model, '."', sep=''), verbose, TRUE)
}
friedman.output <- friedman.test.with.post.hoc(formula=as.formula(formula), data=data)
PrintMsg(paste('"description":"A Friedman test was performed using the model: ', model,
'."', sep=''), verbose)
if (pairwise == TRUE) {
pairwise.output <- list()
pairwise.output[[term[1]]] <- data.frame(row.names=paste(unique(unlist(map[, term[1]])), collapse='-'),
p=friedman.output$PostHoc.Test)
PrintMsg('"description":"A post-hoc test with correction for multiple comparisions was performed."', verbose)
}
summary <- data.frame(row.names=term[1],
p.value=friedman.output$Friedman.Test)
} else if (method == 'kruskal') {
# NON-PARAMETRIC, ACCEPTS MODELS WITH >2 FACTORS BUT ONLY 1 FIXED VARIABLE
# MODEL CAN BE OF THE FORM "a"
if (length(term) > 1) {
PrintMsg(paste('"error":"The model cannot include more than 1 variable when using Kruskal-Wallis rank sum test. Please provide an appropriate model. The current model is: ',
model, ' (', length(term), ' variables)."', sep=''), verbose, TRUE)
}
suppressMessages(library('dunn.test'))
dunn.output <- dunn.test(data[, response.name], g=map[, term[1]], kw=FALSE, table=FALSE, list=FALSE)
PrintMsg(paste('"description":"A Kruskal-Wallis rank sum test was performed using the model: ', model,
'."', sep=''), verbose)
if (pairwise == TRUE) {
pairwise.output <- list()
pairwise.output[[term[1]]] <- data.frame(row.names=paste(unique(unlist(map[, term[1]])), collapse='-'),
p=dunn.output$P)
PrintMsg('"description":"A post-hoc test with correction for multiple comparisions (R library {dunn.test}) was performed."', verbose)
}
summary <- data.frame(row.names=term[1],
                          p.value=pchisq(dunn.output$chi2, length(unique(unlist(map[, term[1]])))-1, lower.tail=FALSE))
} else if (method == 'wilcox') {
    # NON-PARAMETRIC, ACCEPTS MODELS WITH 2 FACTORS ONLY AND ONE FIXED VARIABLE
# MODEL HAS TO BE OF THE FORM "a"
if (length(term) > 1) {
PrintMsg(paste('"error":"The model cannot include more than 1 variable when using Wilcoxon rank sum test. Please provide an appropriate model. The current model is: ',
model, ' (', length(term), ' variables)."', sep=''), verbose, TRUE)
}
    samples <- as.vector(unlist(unique(unlist(data[, model]))))
if (length(samples) != 2) {
PrintMsg(paste('"error":"The Wilcoxon rank sum test can only perform two samples comparison. Please provide an appropriate model. The current model includes the following samples: ',
sample, '."', sep=''), verbose, TRUE)
}
wilcox.output <- wilcox.test(formula=as.formula(formula), data=data)
PrintMsg(paste('"description":"A Wilcoxon rank sum test was performed using the model: ', model,
'."', sep=''), verbose)
summary <- data.frame(row.names=paste(samples, collapse='-'),
p.value=wilcox.output$p.value)
pairwise.output <- list()
pairwise.output[[model]] <- as.data.frame(wilcox.output$p.value)
names(pairwise.output[[model]]) <- model
row.names(pairwise.output[[model]]) <-paste(samples, collapse='-')
} else if (method == 'wilcox_paired') {
    # NON-PARAMETRIC, ACCEPTS MODELS WITH 2 FACTORS ONLY AND ONE FIXED VARIABLE. NESTING IS ASSUMED FROM THE ORDER OF THE DATA!
# MODEL HAS TO BE OF THE FORM "a"
if (length(term) > 1) {
PrintMsg(paste('"error":"The model cannot include more than 1 variable when using Wilcoxon signed rank test. Please provide an appropriate model. The current model is: ',
model, ' (', length(term), ' variables)."', sep=''), verbose, TRUE)
}
samples <- unique(unlist(data[, model]))
if (length(samples) != 2) {
PrintMsg(paste('"error":"The Wilcoxon signed rank test can only perform two samples paired comparison. Please provide an appropriate model. the current model includes the following samples: ',
                     paste(samples, collapse=', '),
'."',
sep=''),
verbose, TRUE)
}
    x <- data[data[, model] == samples[1], response.name]
    y <- data[data[, model] == samples[2], response.name]
if (length(x) != length(y)) {
PrintMsg(paste('"error":"A paired comparison in Wilcoxon signed rank test is possible only if samples have same size. Please provide an appropriate model, correct the map or use different statistics. Current samples are: ',
samples[1],
'(n=',
length(x),
'), ',
samples[2],
'(n=',
length(y),
')."',
sep=''),
verbose, TRUE)
}
wilcox.output <- wilcox.test(formula=as.formula(formula), data=data, paired=TRUE)
PrintMsg(paste('"description":"A Wilcoxon signed rank test was performed using the model: ',
model,
                   '. Pairs: ',
paste(paste(row.names(data)[data[, model] == samples[1]], row.names(data)[data[, model] == samples[2]], sep='-'), collapse=', '),
'."', sep=''), verbose)
summary <- data.frame(row.names=paste(samples, collapse='-'),
p.value=wilcox.output$p.value)
pairwise.output <- list()
pairwise.output[[model]] <- as.data.frame(wilcox.output$p.value)
names(pairwise.output[[model]]) <- model
row.names(pairwise.output[[model]]) <-paste(samples, collapse='-')
} else if (method == 'ttest') {
    # PARAMETRIC, ACCEPTS MODELS WITH 2 FACTORS ONLY AND ONE FIXED VARIABLE.
# MODEL HAS TO BE OF THE FORM "a"
if (length(term) > 1) {
PrintMsg(paste('"error":"The model cannot include more than 1 variable when using t-test. Please provide an appropriate model. The current model is: ',
model, ' (', length(term), ' variables)."', sep=''), verbose, TRUE)
}
samples <- unique(unlist(data[, model]))
if (length(samples) != 2) {
PrintMsg(paste('"error":"A t-test can only perform two samples comparison. Please provide an appropriate model. The current model includes the following samples: ',
                     paste(samples, collapse=', '),
'."',
sep=''),
verbose, TRUE)
}
ttest.output <- t.test(formula=as.formula(formula), data=data)
PrintMsg(paste('"description":"A paired t-test was performed using the model: ',
model,
'."',
sep=''),
verbose)
summary <- data.frame(row.names=paste(samples, collapse='-'),
p.value=ttest.output$p.value)
pairwise.output <- list()
pairwise.output[[model]] <- as.data.frame(ttest.output$p.value)
names(pairwise.output[[model]]) <- model
row.names(pairwise.output[[model]]) <-paste(samples, collapse='-')
} else if (method == 'ttest_paired') {
    # PARAMETRIC, ACCEPTS MODELS WITH 2 FACTORS ONLY AND ONE FIXED VARIABLE. NESTING IS ASSUMED FROM THE ORDER OF THE DATA!
# MODEL HAS TO BE OF THE FORM "a"
if (length(term) > 1) {
PrintMsg(paste('"error":"A model cannot include more than 1 variable when using t-test. Please provide an appropriate model. The current model is: ',
model, ' (', length(term), ' variables)."', sep=''), verbose, TRUE)
}
samples <- unique(unlist(data[, model]))
if (length(samples) != 2) {
PrintMsg(paste('"error":"A paired t-test can only perform two samples comparison. Please provide an appropriate model. The current model includes the following samples: ',
                     paste(samples, collapse=', '),
'."',
sep=''),
verbose, TRUE)
}
    x <- data[data[, model] == samples[1], response.name]
    y <- data[data[, model] == samples[2], response.name]
if (length(x) != length(y)) {
PrintMsg(paste('"error":"A paired comparison in t-test is possible only if samples have same size. Please provide an appropriate model, correct the map or use different statistics. The current samples are: ',
samples[1],
'(n=',
length(x),
'), ',
samples[2],
'(n=',
length(y),
')."',
sep=''),
verbose, TRUE)
}
ttest.output <- t.test(formula=as.formula(formula), data=data, paired=TRUE)
PrintMsg(paste('"description":"A paired t-test was performed using the model: ',
model,
                   '. Pairs: ',
paste(paste(row.names(data)[data[, model] == samples[1]], row.names(data)[data[, model] == samples[2]], sep='-'), collapse=', '),
'."', sep=''), verbose)
summary <- data.frame(row.names=paste(samples, collapse='-'),
p.value=ttest.output$p.value)
pairwise.output <- list()
pairwise.output[[model]] <- as.data.frame(ttest.output$p.value)
names(pairwise.output[[model]]) <- model
row.names(pairwise.output[[model]]) <-paste(samples, collapse='-')
}
stat <- list(summary=summary, pairwise=pairwise.output)
return (stat)
}
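################
# Usage sketch for ComputeStats() (illustration only): a one-factor ANOVA
# with Tukey HSD post-hoc comparisons.
################
if (FALSE) {
  map <- data.frame(group=rep(c('ctrl', 'case'), each=3),
                    row.names=paste('s', 1:6, sep=''))
  response <- c(1.0, 1.2, 0.9, 2.1, 2.3, 1.9)
  out <- ComputeStats(response=response, map=map, method='anova',
                      model='group', pairwise=TRUE)
  out$summary          # one p-value per model term
  out$pairwise$group   # Tukey HSD table for the 'group' factor
}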
################
# Plot rarefaction curves
# Args:
# (1) data (numeric matrix) rarefaction table
# Output to device:
# A figure
################
PlotRarefactionCurves <- function(data=NULL, xlab='Rarefaction depth', ylab='') {
color.palette <- rainbow(ncol(data))
  plot(x=as.numeric(row.names(data)),
y=c(1:nrow(data)),
type="n",
ylim=c(min(data), max(data)),
xlab=xlab,
ylab=ylab)
for (i in 1:ncol(data)) {
    lines(x=as.numeric(row.names(data)),
y=data[, i],
col=color.palette[i])
}
return ()
}
################
# Perform a diversity analysis
# Args:
# (1) table (numeric dataframe) Count table
# (2) map (data-frame) Experimental design table with nrow=ncol(table)
#     (3) fun (character vector) The diversity function(s) to use
#     (4) nrar (numeric) Number of rarefaction depths for the curves
#     (5) compare_diversity (logical) Compute statistics on the diversity?
#     (6) stats (character vector) Statistical method(s), one per model
#     (7) model (formula vector) A formula representing the model for stats.
#         Terms must refer to names(map)
#     (8) json (character) Json string specifying packages
#     (9) verbose (character) Print messages? see PrintMsg() options
#     (10) graphical (logical) Generate graphics?
# Returns:
# Figure data as a json string
# Output to device:
# A figure
# Prints to stdout:
# A description of the function
################
AnalyseDiversity <- function(table=NULL, map=NULL, fun=c('richness', 'shannon'),
nrar=20, compare_diversity=FALSE, stats=NULL, model=NULL,
json=libJson, verbose=TRUE, graphical=TRUE) {
if (is.null(fun)) {
fun <- 'richness'
}
suppressMessages(library('vegan'))
fun <- fun[fun != '']
# Round counts if not integers
if (length(table[table < 0]) > 0) {
PrintMsg('"error":"Rarefaction is not applicable because negative values were detected."', verbose, TRUE)
return ()
}
if (length(table[table%%1 > 0]) > 0) {
PrintMsg('"warning":"Non-integer values were detected. In order to permit rarefaction, these values were rounded to the nearest integer."', verbose)
table <- round(table, digits=0)
}
# Compute rarefaction curves for diversity
colsums <- colSums(table)
min.colsum <- floor(min(colsums))
ncol <- ncol(table)
sample <- matrix(rep(floor(seq(from=0, to=min.colsum, length.out=nrar)), ncol), ncol=ncol, byrow=FALSE)
diversity <- list()
map <- FilterMap(map=map, model=model)
rarefied.table <- list()
for (i in 1:nrar) {
rarefied.table[[i]] <- rrarefy(t(table), sample=apply(rbind(sample[i, ], colsums), 2, min))
}
PrintMsg('"description":"This is a diversity analysis."', verbose)
PrintMsg(paste('"description":"The dataset was rarefied until ', min.colsum, ' counts per sample."', sep=''), verbose)
data.json <- c(1:length(fun))
for (i in 1:length(fun)) {
div <- matrix(ncol=ncol, nrow=0)
v <- TRUE
for (j in 1:nrar) {
div <- rbind(div,
ComputeDiversity(table=t(rarefied.table[[j]]),
fun=fun[i],
json=json,
verbose=v))
v <- FALSE
}
colnames(div) <- colnames(table)
row.names(div) <- sample[, 1]
diversity[[i]] <- as.matrix(div)
# Build figures
if (graphical == TRUE) {
PlotRarefactionCurves(div, ylab=fun[i])
}
# Calculate statistics
if (compare_diversity == TRUE) {
p.value <- list()
for (j in 1:length(model)) {
if (length(stats) == length(model)) {
method <- stats[j]
} else {
method <- stats[1]
}
stat <- ComputeStats(response=div[nrar, ], map=map, method=method,
model=model[j], pairwise=FALSE, verbose=TRUE)$summary
p <- as.data.frame(t(as.matrix(stat$p.value)))
names(p) <- row.names(stat)
p <- as.data.frame(p[, names(p) != 'Residuals'])
      # the following is useful in case the model contains only one element
names(p) <- row.names(stat)[row.names(stat) != 'Residuals']
row.names(p) <- fun[i]
p.value[[j]] <- p
}
PrintMsg(paste('"description":"The statistics were performed at a rarefaction depth of ', min.colsum, ' counts per sample."', sep=''), verbose)
} else {
p.value <- NULL
}
data.json[i] <- paste('"', fun[i], '":', diversity2json(diversity[[i]], p.value, map), sep='')
}
return (paste('{', paste(data.json, collapse=','), '}', sep=''))
}
################
# Permutational Multivariate Analysis of Variance Using Distance Matrices
# Args:
#     (1) table (numeric matrix) Count table (samples as rows)
#     (2) model (character) A formula representing the model. Terms must
#         refer to names(map)
#     (3) strata (character) Name of a map column defining permutation strata
#     (4) map (data-frame) Experimental design table
#     (5) fun (character) The distance function to use
#     (6) graphical (logical) Generate graphics?
# Returns:
# A summary (data-frame)
# Prints to stdout:
# A description of the function
################
Adonis <- function(table=NULL, model=NULL, strata=NULL, map=NULL,
fun='bray', graphical=TRUE) {
  if (!is.null(model)) {
    formula <- paste('table ~ ', model, sep='')
  } else {
    formula <- paste('table ~ ', paste(names(map), collapse='*'), sep='')
  }
if (!is.null(strata)) {
    if (strata == '') {
      strata <- NULL
} else {
#formula <- paste(formula, strata, sep=' / ')
strata <- map[, strata]
}
}
adonis.output <- adonis(formula=as.formula(formula), data=as.data.frame(map),
strata=strata, method=fun,
permutations=1000)
  summary <- data.frame(row.names=gsub('[[:space:]]', '',
row.names(adonis.output$aov.tab)),
p.value=adonis.output$aov.tab[, 'Pr(>F)'],
explained=adonis.output$aov.tab[, 'R2'])
summary <- summary[row.names(summary) != 'Total', ]
# Build the figure
if (graphical == TRUE) {
pie(summary$explained,
labels=row.names(summary),
col=rainbow(length(summary$explained)),
init.angle=0,
radius=0.7,
cex=0.7,
xpd=TRUE)
legend(x=-1, y=-1, xjust=1, yjust=0.5,
title='p-value',
legend=round(summary$p.value, digits=4),
fill=rainbow(nrow(summary)),
cex=0.7, xpd=TRUE)
}
return (summary)
}
################
# Permutational Multivariate Analysis of Variance Using Distance Matrices
# Args:
# (1) table (numeric dataframe) Count table
# (2) map (data-frame) Experimental design table with nrow=ncol(table)
# (3) fun (character) The distance function to use
# (4) model (character vector) A formula representing the model for stats. Terms
# must refer to names(map)
#     (5) strata (character) Name of a map column defining permutation strata
#     (6) json (character) Json string specifying packages
#     (7) verbose (character) Print messages? see PrintMsg() options
#     (8) graphical (logical) Generate graphics?
# Returns:
# Figure data as a json string
# Output to device:
# A figure
# Prints to stdout:
# A description of the function
################
PerformAdonis <- function(table=NULL, map=NULL, fun='bray',
model=NULL, strata=NULL,
json=libJson, verbose=TRUE, graphical=TRUE) {
suppressMessages(library('vegan'))
#out.json <- c(1:length(fun))
#for (k in 1:length(fun)) {
spec <- as.data.frame(do.call('rbind', json$distance))
spec <- spec[spec$value == fun, ]
package <- ''
pkg <- unlist(strsplit(as.character(unlist(spec$pkg)), split=','))
if (length(pkg) > 0) {
package <- paste(' (R package {', unlist(spec$pkg), '})', sep='')
}
PrintMsg(paste('"description":"This is a permutational multivariate analysis of variance using the Adonis method (R package {vegan}) based on the ',
spec$label,
package,
'."',
sep=''),
verbose)
  # Correct for negative entries by adding a scalar (Cailliez correction method)
  if (min(table) < 0) {
    offset <- abs(min(table))
    table <- table+offset
    PrintMsg(paste('"description":"Negative values were corrected using a Cailliez correction (',
                   offset,
                   ' added to all values)."',
                   sep=''),
             verbose)
  }
if (is.null(strata) || strata == '') {
    strata <- NULL
}
if (!is.null(strata) && length(strata) != length(model)) {
PrintMsg(paste('"warning":"Number of strata (',
length(strata),
                   ') differs from the number of models (',
length(model),
'). No strata will be used."',
sep=''),
verbose)
    strata <- NULL
}
# Perform perMANOVA
#map <- FilterMap(map=map, model=model)
data.json <- c(1:length(model))
for (i in 1:length(model)) {
stat <- Adonis(table=t(table),
model=model[i], strata=strata[i], map=map,
fun=fun, graphical=graphical)
stat.json <- c(1:nrow(stat))
for (j in 1:nrow(stat)) {
stat.json[j] <- paste('{"name":"',
row.names(stat)[j],
'","p-value":"',
stat$p.value[j],
'","explained":"',
stat$explained[j],
'"}',
sep='')
}
data.json[i] <- paste('"',
model[i],
'":[',
paste(stat.json, collapse=','),
']',
sep='')
}
#out.json[k] <- paste('"',
# fun[k],
# '":"{',
# paste(data.json, collapse=','),
# '}',
# sep='')
#}
#return (paste('{',paste(out.json, collapse=','), '}', sep=''))
return (paste('{',paste(data.json, collapse=','), '}', sep=''))
}
################
# Principal Component Analysis (PCA)
# Args:
# (1) table (numeric dataframe) Count table
# (2) map (data-frame) Experimental design table with nrow=ncol(table)
#     (3) biplot (logical) Add the variables coordinates to the output?
#     (4) verbose (character) Print messages? see PrintMsg() options
#     (5) graphical (logical) Generate graphics?
# Returns:
# Figure data as a json string
# Output to device:
# A figure
# Prints to stdout:
# A description of the function
################
PerformPCA <- function(table=NULL, map=NULL, biplot=TRUE,
verbose=TRUE, graphical=TRUE) {
suppressMessages(library('FactoMineR'))
ncp <- min(max(2, ncol(table)), 5)
pca.output <- PCA(t(table), ncp=ncp, scale.unit=FALSE, graph=FALSE)
if (graphical == TRUE) {
plot(pca.output, new.plot=FALSE)
}
# Pack data into a json string
data.ind <- as.data.frame(t(as.data.frame(pca.output$ind$coord)))
row.names(data.ind) <- gsub('Dim.', 'PC', row.names(data.ind))
data.json <- Dataframe2DataJson(data=data.ind, xmetadata=map)
eig.json <- c(1:ncp)
for (i in 1:ncp) {
eig.json[i] <- paste('"PC',
i,
'":"',
pca.output$eig[i, 2],
'"',
sep='')
}
if (biplot == TRUE) {
data.var <- as.data.frame(t(as.data.frame(pca.output$var$coord)))
row.names(data.var) <- gsub('Dim.', 'PC', row.names(data.var))
data.json <- paste('{"ind":',
data.json,
',"var":',
Dataframe2DataJson(data=data.var),
',"eig":{',
paste(eig.json, collapse=','),
'}}',
sep='')
} else {
data.json <- paste('{"data":',
data.json,
',"eig":{',
paste(eig.json, collapse=','),
'}}',
sep='')
}
PrintMsg('"description":"This is a principal component analysis (PCA) (R package {FactoMineR})."',
verbose)
return (data.json)
}
################
# Correspondence Analysis (CA)
# Args:
# (1) table (numeric dataframe) Count table
# (2) map (data-frame) Experimental design table with nrow=ncol(table)
#     (3) verbose (character) Print messages? see PrintMsg() options
#     (4) graphical (logical) Generate graphics?
# Returns:
# Figure data as a json string
# Output to device:
# A figure
# Prints to stdout:
# A description of the function
################
PerformCA <- function(table=NULL, map=NULL,
verbose=TRUE, graphical=TRUE) {
suppressMessages(library('FactoMineR'))
ncp <- min(max(2, ncol(table)), 5)
ca.output <- CA(table, ncp=ncp, graph=FALSE)
if (graphical == TRUE) {
plot(ca.output, new.plot=FALSE)
}
# Pack data into a json string
data.ind <- as.data.frame(t(as.data.frame(ca.output$col$coord)))
data.json <- Dataframe2DataJson(data=data.ind, xmetadata=map)
eig.json <- c(1:ncp)
for (i in 1:ncp) {
eig.json[i] <- paste('"Dim',
i,
'":"',
ca.output$eig[i, 2],
'"',
sep='')
}
data.var <- t(as.data.frame(ca.output$row$coord))
data.json <- paste('{"ind":',
data.json,
',"var":',
Dataframe2DataJson(data=data.var),
',"eig":{',
paste(eig.json, collapse=','),
'}}',
sep='')
PrintMsg('"description":"This is a correspondence analysis (CA) (R package {FactoMineR})."',
verbose)
return (data.json)
}
################
# Principal Coordinates Analysis (PCoA)
# Args:
# (1) table (numeric dataframe) Count table
# (2) map (data-frame) Experimental design table with nrow=ncol(table)
# (3) fun (character) The distance function to use
#     (4) json (character) Json string specifying packages
#     (5) verbose (character) Print messages? see PrintMsg() options
#     (6) graphical (logical) Generate graphics?
# Returns:
# Figure data as a json string
# Output to device:
# A figure
# Prints to stdout:
# A description of the function
################
PerformPCoA <- function(table=NULL, map=NULL, fun='bray',
json=libJson, verbose=TRUE, graphical=TRUE) {
spec <- as.data.frame(do.call('rbind', json$distance))
spec <- spec[spec$value == fun, ]
package <- ''
pkg <- unlist(strsplit(as.character(unlist(spec$pkg)), split=','))
if (length(pkg) > 0) {
for (i in 1:length(pkg)) {
suppressMessages(library(pkg[i], character.only=TRUE))
}
package <- paste(' (R package {', unlist(spec$pkg), '})', sep='')
}
dist <- as.matrix(ComputeDistance(table=Transpose(table),
fun=fun, json=json, verbose=FALSE))
row.names(dist) <- names(table)
data.json <- PerformPCA(table=dist, map=map, biplot=FALSE,
verbose=FALSE, graphical=graphical)
PrintMsg(paste('"description":"This is a principal coordinate analysis (PCoA) (R package {FactoMineR}) based on ',
spec$label,
package,
'."',
sep=''),
verbose)
return (data.json)
}
################
# Canonical Correspondence Analysis (CCA)
# Args:
# (1) table (numeric dataframe) Count table
# (2) map (data-frame) Experimental design table with nrow=ncol(table)
#     (3) column (character) Comma-separated names of map columns to use as
#         constraining variables
#     (4) verbose (character) Print messages? see PrintMsg() options
#     (5) graphical (logical) Generate graphics?
# Returns:
# Figure data as a json string
# Output to device:
# A figure
# Prints to stdout:
# A description of the function
################
PerformCCA <- function(table=NULL, map=NULL, column=NULL,
verbose=TRUE, graphical=TRUE) {
suppressMessages(library('vegan'))
table <- as.data.frame(t(table))
names <- unlist(strsplit(column, split=','))
categs <- as.data.frame(map[, names])
names(categs) <- names
row.names(categs) <- row.names(map)
cca.output <- cca(table ~ ., data=categs)
if (graphical == TRUE) {
plot(cca.output)
}
# Pack data into a json string
data.ind <- as.data.frame(t(cca.output$CCA$u))
data.var <- as.data.frame(t(cca.output$CCA$v))
data.biplot <- as.data.frame(t(cca.output$CCA$biplot))
data.json <- paste('{"ind":',
Dataframe2DataJson(data=data.ind, xmetadata=map),
',"var":',
Dataframe2DataJson(data=data.var),
',"biplot":',
Dataframe2DataJson(data=data.biplot),
'}',
sep='')
PrintMsg(paste('"description":"This is a canonical correspondence analysis (CCA) (R package {vegan}) with the following constraining variable(s):',
column,
'."',
sep=''),
verbose)
return (data.json)
}
################
# Compute fold-change
# Args:
#     (1) group1 (numeric vector) Data for the first group
#     (2) group2 (numeric vector) Data for the second group
#     (3) names (character vector) Names of the two groups
#     (4) verbose (character) Print messages? see PrintMsg() options
# Returns:
#     A list with the log2 fold-change ('logfc') and the mean abundance ('ab')
# Prints to stdout:
# A description of the function
################
ComputeFoldChange <- function(group1=NULL, group2=NULL, names=NULL, verbose=TRUE) {
out <- list()
out[['logfc']] <- log2(mean(group2))-log2(mean(group1))
out[['ab']] <- mean(c(group2, group1))
PrintMsg(paste('"description":"Fold-change is calculated as log2(mean(',
names[2],
'))-log2(mean(',
names[1],
')) and abundance is calculated as (mean(',
names[1],
')+mean(',
names[2],
'))/2."',
sep=''),
verbose)
return(out)
}
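################
# Usage sketch for ComputeFoldChange() (illustration only):
################
if (FALSE) {
  ComputeFoldChange(group1=c(2, 4), group2=c(8, 16), names=c('ctrl', 'case'))
  # $logfc = log2(12) - log2(3) = 2 ; $ab = mean(c(8, 16, 2, 4)) = 7.5
}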
################
# Extract group of samples from map
# Args:
#     (1) map (data-frame) Experimental design table
#     (2) column (character) Name of the map column defining the groups
#     (3) group (character) ':'-separated group levels to extract
#     (4) verbose (character) Print messages? see PrintMsg() options
# Returns:
#     A binary vector (1 = sample belongs to the group)
################
ExtractGroupFromMap <- function(map=NULL, column=NULL, group=NULL, verbose) {
map <- as.data.frame(FilterMap(map=map, model=column))
group <- unlist(strsplit(group, split=':'))
  if (ncol(map) != length(group)) {
    PrintMsg('"error":"Unable to determine groups for fold-change analysis. Make sure no - or : character is present in the mapping file."',
             verbose)
}
keep <- 1
for (i in 1:ncol(map)) {
keep <- keep*(map[, i] == group[i])
}
return(keep)
}
################
# Analyse changes (fold-changes, p-values etc.)
# Args:
# (1) table (numeric dataframe) Count table
# (2) map (data-frame) Experimental design table with nrow=ncol(table)
# (3) model (character vector) A formula representing the model for stats. Terms
# must refer to names(map)
# (4) json (character) Json string specifying packages
# (5) verbose (character) Print messages? see PrintMsg() options
# Returns:
# Figure data as a json string
# Prints to stdout:
# A description of the function
################
AnalyseChange <- function(table=NULL, map=NULL,
stats=NULL, model=NULL,
json=libJson, verbose=TRUE,
graphical=FALSE) {
PrintMsg('"description":"This is a differential analysis (fold-changes and statistical tests)."',
verbose)
# Calculate statistics
if (is.null(model)) {
    model <- paste(names(map), collapse='+')
}
nmodel <- length(model)
data.json <- c(1:nmodel)
for (i in 1:nmodel) {
if (length(stats) != length(model)) {
method = stats[1]
} else {
method = stats[i]
}
nrow <- nrow(table)
effect.json <- list()
v <- verbose
graphic.data <- list()
for (j in 1:nrow) {
stat <- ComputeStats(response=as.vector(unlist(table[j, ])), map=map,
method=method, model=model[i], pairwise=TRUE,
verbose=v)$pairwise
effect.names <- names(stat)
neffect <- length(effect.names)
for (k in 1:neffect) {
if (j == 1 && k == 1) {
graphic.data[[effect.names[k]]] <- list()
}
p <- unlist(stat[[k]][, ncol(stat[[k]])])
if (length(effect.json) == 0) {
effect.json[[effect.names[k]]] <- list()
}
comp.names <- row.names(stat[[k]])
ncomp <- length(comp.names)
for (l in 1:ncomp) {
if (j == 1 && l == 1) {
graphic.data[[effect.names[k]]][[comp.names[l]]] <- data.frame(logfc=c(1:nrow), ab=c(1:nrow))
row.names(graphic.data[[effect.names[k]]][[comp.names[l]]]) <- row.names(table)
}
if (length(effect.json[[effect.names[k]]]) == 0) {
effect.json[[effect.names[k]]][[comp.names[l]]] <- list()
effect.json[[effect.names[k]]][[comp.names[l]]] <- c(1:nrow)
}
group.names <- unlist(strsplit(row.names(stat[[k]])[l], split='-'))
if (length(group.names) != 2) {
PrintMsg('"error":"Unable to determine groups for fold-change analysis. Make sure no - or : character is present in the mapping file."', verbose)
}
group1 <- ExtractGroupFromMap(map=map, column=names(stat)[k], group=group.names[1], verbose)
group2 <- ExtractGroupFromMap(map=map, column=names(stat)[k], group=group.names[2], verbose)
fc <- ComputeFoldChange(as.vector(unlist(table[j, group1 == 1])),
as.vector(unlist(table[j, group2 == 1])),
names=group.names,
verbose=v)
effect.json[[effect.names[k]]][[comp.names[l]]][j] <- paste('{"mean abundance":"',
fc$ab,
'","log2(fold-change)":"',
fc$logfc,
'","-log2(p-value)":"',
-log2(p[l]),
'"}',
sep='')
graphic.data[[effect.names[k]]][[comp.names[l]]]$logfc[j] <- fc$logfc
graphic.data[[effect.names[k]]][[comp.names[l]]]$ab[j] <- fc$ab
}
}
v <- FALSE
}
for (k in 1:length(effect.json)) {
for (l in 1:length(effect.json[[k]])) {
effect.json[[k]][[l]] <- paste('"',
names(effect.json[[k]])[l],
'":[',
paste(effect.json[[k]][[l]], collapse=','),
']',
sep='')
if (graphical == TRUE) {
plot(graphic.data[[k]][[l]]$ab,
graphic.data[[k]][[l]]$logfc,
main=names(effect.json[[k]])[l],
xlab='mean abundance',
ylab='log2(fold-change)')
}
}
effect.json[[k]] <- paste('"',
names(effect.json)[k],
'":{',
paste(effect.json[[k]], collapse=','),
'}',
sep='')
}
data.json[i] <- paste('"',
model[i],
'":{',
paste(effect.json, collapse=','),
'}',
sep='')
}
return(paste('{"data":{',
paste(data.json, collapse=','),
'},"names":["',
paste(row.names(table), collapse='","'),
'"]}',
sep=''))
}
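# Usage sketch (hypothetical objects): pairwise fold-changes and tests for a
# single-factor model
# change.json <- AnalyseChange(table=counts, map=design, stats='anova',
#                              model='treatment', verbose=FALSE, graphical=FALSE)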
################
# Build a heatmap of abundances
# Args:
# (1) table (numeric dataframe) Count table
# (2) map (data-frame) Experimental design table with nrow=ncol(table)
# (3) stats (character vector) Statistical test(s), one per model term
# (4) model (character vector) A formula representing the model for stats. Terms
# must refer to names(map)
# (5) secondary (numeric dataframe) Optional secondary dataset
# (6) fun (character) Correlation/distance function for hierarchical clustering
# (7) json (character) Json string specifying packages
# (8) verbose (character) Print messages? see PrintMsg() options
# (9) graphical (logical) Generate graphics?
# Returns:
# Figure data as a json string
# Output to device:
# A figure
# Prints to stdout:
# A description of the function
################
BuildHeatMap <- function(table=NULL, map=NULL, stats='anova',
model=NULL, secondary=NULL,
fun='spearman',
json=libJson, verbose=TRUE, graphical=TRUE) {
suppressMessages(library('gplots'))
PrintMsg('"description":"This is a heat-map of abundances."',
verbose=verbose)
# Check that there is no constant data
RemoveConstantRows(table=table, verbose=TRUE)
if ((nrow(table) < 3) || (ncol(table) < 3)) {
PrintMsg('"error":"Not enough data to draw a heatmap (min 3 rows and 3 columns)."', TRUE, TRUE)
}
# Center/scale by row
data <- as.data.frame(Transpose(apply(table, 1, cScale)))
names(data) <- names(table)
row.names(data) <- row.names(table)
PrintMsg('"description":"Abundances are scaled on standard deviation and centered on mean per row."',
verbose=verbose)
# keep only columns of map that are used in the model
map <- FilterMap(map=map, model=model)
# Set function for hierarchical clustering
DistFun <- function(x) {
return (ComputeDistance(table=x,
fun=fun,
json=json,
verbose=FALSE))
}
if (graphical == FALSE) {
pdf(file = NULL)
}
# Set the color palette
col <- colorRampPalette(c("navy", "seashell", "red2"))(n=51)
heatmap.out <- heatmap.2(as.matrix(data),
scale='row',
na.rm=TRUE,
Rowv=TRUE,
Colv=TRUE,
dendrogram='both',
#distfun=DistFun, # custom distance (fun) currently disabled; the heatmap.2 default (euclidean) applies
col=col,
symm=FALSE,
symkey=FALSE,
key=TRUE,
keysize=1.5,
density.info='density',
trace='none',
labCol=names(data),
labRow=row.names(data),
cexCol=0.2+1/log10(ncol(data)),
margins=c(5, 10))
if (graphical == FALSE) {
dev.off()
}
# Order data according to dendrogram
data <- data[heatmap.out$rowInd, ]
data <- data[, heatmap.out$colInd]
names <- names(map)
map <- as.data.frame(map[heatmap.out$colInd, ])
names(map) <- names
if (!is.null(secondary)) {
secondary <- secondary[, heatmap.out$colInd]
}
# Store heatmap data into a json string
ncol <- ncol(data)
heatmap.json <- c(1:ncol)
data.names <- names(data)
for (i in 1:ncol) {
heatmap.json[i] <- paste('{"name":"',
data.names[i],
'","value":["',
paste(data[, i], collapse='","'),
'"]}',
sep='')
}
# Add categories to topbar
topbar.json <- list()
ncateg <- ncol(map)
category.json <- c(1:ncateg)
categ.names <- names(map)
for (i in 1:ncateg) {
category.json[i] <- paste('{"name":"',
categ.names[i],
'","value":["',
paste(map[, i], collapse='","'),
'"]}',
sep='')
}
topbar.json[['category']] <- paste('"category":[',
paste(category.json, collapse=','),
']',
sep='')
# Add p-values to sidebar
sidebar.json <- list()
if (is.null(model)) {
model <- paste(names(map), collapse='+')
}
nmodel <- length(model)
stat.json <- c(1:nmodel)
for (i in 1:nmodel) {
if (length(stats) != length(model)) {
method <- stats[1]
} else {
method <- stats[i]
}
nrow <- nrow(data)
v <- verbose
p.value <- list()
for (j in 1:nrow) {
stat <- ComputeStats(response=as.vector(unlist(data[j, ])), map=map,
method=method, model=model[i], pairwise=FALSE,
verbose=v)$summary
stat.rownames <- row.names(stat)[row.names(stat) != 'Residuals']
stat <- as.data.frame(stat[row.names(stat) != 'Residuals', ])
nstat <- nrow(stat)
for (k in 1:nstat) {
if (is.null(p.value[[stat.rownames[k]]])) {
p.value[[stat.rownames[k]]] <- c(1:nrow)
}
p.value[[stat.rownames[k]]][j] <- stat[k, ]
}
v <- FALSE
}
for (j in 1:length(p.value)) {
p.value[[j]] <- paste('{"name":"',
names(p.value)[j],
'","value":["',
paste(p.value[[j]], collapse='","'),
'"]}',
sep='')
}
stat.json[i] <- paste('"',
model[i],
'":[',
paste(p.value, collapse=','),
']',
sep='')
}
sidebar.json[['p-values']] <- paste('"p-values":{',
paste(stat.json, collapse=','),
'}',
sep='')
# Add correlations with secondary dataset to sidebar
if (! is.null(secondary)) {
PrintMsg('"description":"The correlations between the primary and secondary datasets were calculated using the Spearman rank correlation."',
verbose=verbose)
correlation <- as.data.frame(cor(x=t(data), y=t(secondary), use='na.or.complete', method ="spearman"))
ncor <- ncol(correlation)
cor.json <- c(1:ncor)
for (i in 1:ncor) {
cor.json[i] <- paste('{"name":"',
names(correlation)[i],
'","value":["',
paste(correlation[, i], collapse='","'),
'"]}',
sep='')
}
sidebar.json[['correlations']] <- paste('"correlations":[',
paste(cor.json, collapse=','),
']',
sep='')
}
data.json <- paste('{"heatmap":[',
paste(heatmap.json, collapse=','),
'],"colnames":["',
paste(names(data), collapse='","'),
'"],"rownames":["',
paste(row.names(data), collapse='","'),
'"],"topbar":{',
paste(topbar.json, collapse=','),
'},"sidebar":{',
paste(sidebar.json, collapse=','),
'},"colDendrogram":',
dendrogram2json(heatmap.out$colDendrogram)[[1]],
',"rowDendrogram":',
dendrogram2json(heatmap.out$rowDendrogram)[[1]],
'}',
sep='')
return (data.json)
}
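# Usage sketch (hypothetical objects):
# heatmap.json <- BuildHeatMap(table=counts, map=design, stats='anova',
#                              model='treatment', fun='spearman',
#                              verbose=FALSE, graphical=FALSE)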
################
# Get the groups with the highest mean
# Args:
# (1) data (numeric vector) The data
# (2) map (data-frame) Experimental design table with nrow=length(data)
# (3) column (character) Column name(s) in map, ':'-separated for interactions
# (4) verbose (character) Print messages? see PrintMsg() options
# Returns:
# A list with $value (highest group mean), $id (label of the group with the
# highest mean) and $group (all group labels)
# Prints to stdout:
# A description of the function
################
getGroupWithHighestMean <- function(data=NULL, map=NULL, column=NULL, verbose=TRUE) {
PrintMsg('"description":"Determine groups with highest mean abundance."',
verbose=verbose)
map <- as.data.frame(map[, unlist(strsplit(column, split=':'))])
if (ncol(map) == 1) {
names(map) <- column
}
group <- as.data.frame(unique(map))
if (ncol(group) == 1) {
names(group) <- column
}
# Use a distinct name so base::mean is not masked by a local variable
group.mean <- c(1:nrow(group))
for (i in 1:nrow(group)) {
keep <- 1
for (j in 1:ncol(map)) {
keep <- keep*(map[, j] %in% group[i, names(map)[j]])
}
group.mean[i] <- mean(unlist(data[as.logical(keep)]))
}
out <- list()
out$value <- max(group.mean)
out$id <- paste(unlist(group[which(group.mean == max(group.mean)), ]), collapse=':')
out$group <- apply(group, 1, paste, collapse=':')
return (out)
}
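# Usage sketch (hypothetical objects):
# gwhm <- getGroupWithHighestMean(data=counts[1, ], map=design,
#                                 column='treatment', verbose=FALSE)
# gwhm$id # label of the group with the highest mean abundance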
################
# Build a network of correlations for variables
# Args:
# (1) table (numeric dataframe) Count table
# (2) map (data-frame) Experimental design table with nrow=ncol(table)
# (3) stats (character vector) Statistical test(s), one per model term
# (4) model (character vector) A formula representing the model for stats. Terms
# must refer to names(map)
# (5) secondary (numeric dataframe) Optional secondary dataset
# (6) fun (character) Distance, correlation or similarity function
# (7) json (character) Json string specifying packages
# (8) verbose (character) Print messages? see PrintMsg() options
# Returns:
# Figure data as a json string
# Prints to stdout:
# A description of the function
################
BuildCorrelationNetwork <- function(table=NULL, map=NULL, stats='anova',
model=NULL, secondary=NULL, fun='spearman',
json=libJson, verbose=TRUE) {
spec <- as.data.frame(do.call('rbind', json$correlation))
spec <- spec[spec$value == fun, ]
pkg <- unlist(strsplit(as.character(unlist(spec$pkg)), split=','))
package <- ''
if (length(pkg) > 0) {
for (i in 1:length(pkg)) {
suppressMessages(library(pkg[i], character.only=TRUE))
}
package <- paste(' (R package {', unlist(spec$pkg), '})', sep='')
}
PrintMsg(paste('"description":"This is a correlation network. It was built using ',
spec$label, package, '."', sep=''),
verbose=verbose)
suppressMessages(library(igraph))
# keep only columns of map that are used in the model
map <- FilterMap(map=map, model=model)
# Combine with secondary dataset
data <- rbind(table, secondary)
data.type <- rep('primary dataset', nrow(data))
if (!is.null(secondary)) {
data.type[(nrow(table)+1):nrow(data)] <- "secondary dataset"
}
# Check that there is no constant data
RemoveConstantRows(table=data, verbose=TRUE)
if ((nrow(data) < 3) || (ncol(data) < 3)) {
PrintMsg('"error":"Not enough data to compute a network (min 3 rows and 3 columns)."', TRUE, TRUE)
}
# Calculate the correlation matrix
weight <- ComputeCorrelation(table=data, fun=fun, test=TRUE,
json=json, verbose=FALSE)
nmodel <- length(model)
nnodes <- nrow(data)
node.json <- c(1:nnodes)
link.json <- c(1:(nnodes*(nnodes-1)/2)) # one link per unordered pair of nodes
l <- 1
legend.json <- list()
for (i in 1:nnodes) {
model.json <- c(1:nmodel)
for (j in 1:nmodel) {
+ if (length(stats) != length(model)) {
+ method <- stats[1]
+ } else {
+ method <- stats[j] # stats is aligned with model, which j indexes here
+ }
stat <- ComputeStats(response=as.vector(unlist(data[i, ])), map=map,
- method=stats, model=model[j], pairwise=FALSE,
+ method=method, model=model[j], pairwise=FALSE,
verbose=FALSE)$summary
neffect <- length(row.names(stat)[row.names(stat) != 'Residuals'])
effect.json <- c(1:neffect)
if (length(legend.json[[model[j]]]) == 0) {
legend.json[[model[j]]] <- c(1:neffect)
}
for (k in 1:neffect) {
gwhm <- getGroupWithHighestMean(data=data[i, ], map=map, column=row.names(stat)[k], verbose=FALSE)
effect.json[k] <- paste('"',
row.names(stat)[k],
'":{"p-value":"',
stat[k, ],
'","highest-mean":"',
gwhm$id,
'"}',
sep='')
if (i == 1) {
legend.json[[model[j]]][k] <- paste('"',
row.names(stat)[k],
'":["',
paste(gwhm$group, collapse='","'),
'"]',
sep='')
}
}
model.json[j] <- paste('"',
model[j],
'":{',
paste(effect.json, collapse=','),
'}',
sep='')
if (i == 1) {
legend.json[[model[j]]] <- paste('"',
model[j],
'":{',
paste(legend.json[[model[j]]], collapse=','),
'}',
sep='')
}
}
node.json[i] <- paste('{"id":"',
i-1,
'","name":"',
row.names(data)[i],
'","data-type":"',
data.type[i],
'","stat":{',
paste(model.json, collapse=','),
'},"mean":"',
mean(unlist(data[i, ])),
'","min":"',
min(unlist(data[i, ])),
'","max":"',
max(unlist(data[i, ])),
'"}',
sep='')
if (i < nnodes) {
for (j in (i+1):nnodes) {
link.json[l] <- paste('{"source":"',
i-1,
'","target":"',
j-1,
'","weight":"',
weight$estimate[i, j],
'","p-value":"',
weight$p.value[i, j],
'"}',
sep='')
l <- l+1
}
}
}
return (paste('{"nodes":[',
paste(node.json, collapse=','),
'],"links":[',
paste(link.json, collapse=','),
'],"legend":{',
paste(unlist(legend.json), collapse=','),
'}}',
sep=''))
}
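# Usage sketch (hypothetical objects; 'spearman' must exist in json$correlation):
# network.json <- BuildCorrelationNetwork(table=counts, map=design,
#                                         stats='anova', model='treatment',
#                                         fun='spearman', verbose=FALSE)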
################
# Build a network of similarity for samples
# Args:
# (1) table (list of numeric dataframes) Count tables
# (2) map (data-frame) Experimental design table with nrow=ncol(table)
# (3) funs (character vector) Distance or similarity functions, one per table
# (4) clust (character) Network clustering algorithm
# (5) clust.names (character vector) Names for the cluster columns
# (6) json (character) Json string specifying packages
# (7) verbose (character) Print messages? see PrintMsg() options
# (8) lib (character) Path to holynet library
# Returns:
# List [1] json string (fusion network) [2] data-frame (clusters)
# Prints to stdout:
# A description of the function
################
BuildSimilarityNetwork <- function(table=NULL, map=NULL, funs=NULL,
clust="walktrap", clust.names=NULL,
json=libJson, verbose=TRUE,
lib='holynetlib.R') {
source(lib)
spec <- as.data.frame(do.call('rbind', json$distance))
for (i in 1:length(table)) {
fun <- funs[1]
if ((length(funs) == length(table)) && (length(funs) > 1)) {
fun <- funs[i]
}
# Subset into a separate variable: spec is looked up again at the next
# iteration, so it must not be overwritten here
fun.spec <- spec[spec$value == fun, ]
pkg <- unlist(strsplit(as.character(unlist(fun.spec$pkg)), split=','))
package <- ''
if (length(pkg) > 0) {
for (j in 1:length(pkg)) {
suppressMessages(library(pkg[j], character.only=TRUE))
}
package <- paste(' (R package {', unlist(fun.spec$pkg), '})', sep='')
}
PrintMsg(paste('"description":"This is a similarity network. It was built using ',
fun.spec$label, package, '."', sep=''),
verbose=verbose)
}
# Calculate similarity matrices
mat <- list()
for (i in 1:length(table)) {
fun <- funs[1]
if (length(funs) == length(table)) {
fun <- funs[i]
}
mat[[i]] <- CreateAdjacencyMatrix(CreateSimilarityMatrix(table[[i]], fun))
}
# If multiple matrices exist, fuse them
if (length(mat) > 1) {
fusmat <- FuseMatrices(mat)
if (is.null(fusmat)) {
PrintMsg('"error":"Trying to fuse matrices but dimensions are not matching."', TRUE, TRUE)
}
mat[[(length(mat)+1)]] <- fusmat
}
out <- list()
# Compute clustering on similarity matrices
if (!is.null(clust) && (clust != 'none')) {
spec <- as.data.frame(do.call('rbind', json$graph_clustering))
spec <- spec[spec$value == clust, ]
pkg <- unlist(strsplit(as.character(unlist(spec$pkg)), split=','))
package <- ''
if (length(pkg) > 0) {
for (i in 1:length(pkg)) {
suppressMessages(library(pkg[i], character.only=TRUE))
}
package <- paste(' (R package {', unlist(spec$pkg), '})', sep='')
}
PrintMsg(paste('"description":"Clusters were determined using the ',
spec$label, package, '."', sep=''),
verbose=verbose)
clusters <- matrix(ncol=length(mat), nrow=nrow(mat[[1]]))
for (i in 1:length(mat)) {
clusters[, i] <- paste(clust,
as.character(ApplyNetworkClustering(CreateNetwork(mat[[i]]), clust_method=clust)),
sep='')
}
clusters <- as.data.frame(clusters)
if (!is.null(clust.names)) {
names(clusters) <- clust.names
}
out[['txt']] <- clusters
map <- cbind(map, clusters)
}
# Pack data into a json string
json.data <- buildFusionNetworkJson(mat, map)
out[['json']] <- json.data
return (out)
}
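# Usage sketch (hypothetical objects; the funs values must exist in
# json$distance):
# net <- BuildSimilarityNetwork(table=list(counts, metabolites), map=design,
#                               funs=c('braycurtis', 'euclidean'),
#                               clust='walktrap', verbose=FALSE)
# net$json # fusion network as a json string; net$txt holds the cluster table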
#################################
# (\/)(°,,,°)(\/) #
#################################
libJson <- ReadJson()
diff --git a/lib/genocrunch_console/lib/genocrunchlib.py b/lib/genocrunch_console/lib/genocrunchlib.py
index ed9dd68..a5395d0 100755
--- a/lib/genocrunch_console/lib/genocrunchlib.py
+++ b/lib/genocrunch_console/lib/genocrunchlib.py
@@ -1,1008 +1,1018 @@
# -*- coding: utf-8 -*-
#genocrunchlib.py
import os, sys, argparse, tarfile
from json import load, loads, dump
from subprocess import Popen, PIPE
from shutil import copyfile, move, rmtree
from itertools import izip
from collections import OrderedDict
from tempfile import mkdtemp, NamedTemporaryFile
from random import choice
from string import ascii_uppercase, digits
from copy import copy
from datetime import datetime
__dir__ = os.path.dirname(__file__)
main_dir = os.path.dirname(__dir__)
def Error(f=__file__, msg='', to_rm=None):
    # None instead of a mutable default argument; to_rm is only iterated
    print 'Error ['+ os.path.basename(f) +']: '+ msg
    if to_rm:
        for t in to_rm:
            rmtree(t)
    raise SystemExit(1)
def Warning(f=__file__, msg=''):
print 'Warning ['+ os.path.basename(f) +']: '+ msg
def mkdir(dp=''):
"""Create a directory if not already present
Args:
dp Directory path
"""
if not os.path.exists(dp):
os.makedirs(dp)
return 0
def rand(n=1):
"""Generate a random string
n Length of the random string
"""
return ''.join(choice(ascii_uppercase + digits) for _ in range(n))
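# Example: rand(3) draws 3 characters from [A-Z0-9], e.g. 'A7K'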
class Parameters(object):
"""Parameters retrieved from genocrunchlib.json, from user's json and passed in command line"""
def __init__(self, json_fp=''):
"""Populate the object
json_fp Json file path
"""
# Set authorized arguments for cli
argument_parser = argparse.ArgumentParser()
argument_parser._action_groups.pop()
required_arguments = argument_parser.add_argument_group('required arguments')
optional_arguments = argument_parser.add_argument_group('optional arguments')
optional_arguments.add_argument('--params_fp',
help='Path to a JSON file containing parameters.',
nargs=1,
type=str)
optional_arguments.add_argument('--analysis',
help='List of analyses to perform (comma-separated). Valid choices: proportions,diversity,adonis,pca,pcoa,heatmap,change,correlation_network,clustering,similarity_network',
nargs=1,
default='proportions,pcoa',
type=str)
required_arguments.add_argument('--output',
help='Path to output directory.',
nargs=1,
type=str)
# Define parameters based on the json
self.json_fp = json_fp
with open(self.json_fp, 'r') as f:
self.json_s = load(f)
self.fps = []
required = []
for key, value in self.json_s.get('fields').iteritems():
for val in value:
if 'scope' not in val.keys() or val.get('scope') != 'form_only':
if 'default' in val.keys():
default = val.get('default')
else:
default=None
# Add authorized cli arguments from the json
if 'optional' in val.keys() and val.get('optional') == False:
required.append(val.get('id'))
required_arguments.add_argument('--'+val.get('id'),
help=val.get('help'),
nargs=1,
default=default,
type=str)
else:
optional_arguments.add_argument('--'+val.get('id'),
help=val.get('help'),
nargs=1,
default=default,
type=str)
if val.get('type') == 'file':
self.fps.append(val.get('id'))
# Retrieve cli arguments
self.params = argument_parser.parse_args().__dict__
# Set params values from user's json if provided (does not overwrite parameters passed in command line)
# Careful with booleans as form checkboxes return 1 or nothing...
for key, value in self.params.iteritems():
if isinstance(value, list):
self.params[key] = value[0]
if self.params['params_fp'] is not None and os.path.exists(self.params['params_fp']):
with open(self.params.get('params_fp'), 'r') as f:
params = load(f)
for key, value in params.items():
if '--'+key not in sys.argv[1:]:
if value in ['', [], ['']]:
self.params[key] = None
else:
self.params[key] = value
# Handle booleans
for e in ['prim_rarefaction', 'sec_rarefaction']:
if e not in params.keys():
self.params[e] = False
elif params[e] in ['', ' ', '0']:
self.params[e] = False
else:
self.params[e] = bool(params[e])
if '--analysis' in sys.argv[1:]:
for a in self.params['analysis'].split(','):
self.params[a] = 1
self.params.pop('analysis')
for key, value in self.params.iteritems():
if value is not None:
if isinstance(value, str) and len(value.split(',')) > 1:
self.params[key] = value.split(',')
elif key in required:
Error(msg='argument --'+key+' is required')
# Check that all files exist
missing = []
for fp in self.fps:
if self.params[fp] is not None and not os.path.exists(self.params[fp]):
missing.append('--'+fp+' '+self.params[fp])
if len(missing) > 0:
Error(msg='The following file(s) were not found: '+', '.join(missing))
def write(self, fp=''):
"""Write all parameters to a file
fp File path
"""
with open(fp, 'w') as f:
dump(self.params, f, indent=2)
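# Usage sketch (hypothetical wrapper; file names are illustrative):
#   params = Parameters(json_fp='genocrunchlib.json')
#   params.write(params.params['output']+'/params.json')
# equivalent CLI: --params_fp user.json --output results --analysis proportions,pcoa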
def table2R(fp):
"""Adapt file format for easy loading in R:
Remove comments from the top of the file (also empty first element of first line)
"""
sub = Popen([main_dir+'/bin/modify_table.sh',
fp,
'table2R',
fp])
sub.wait()
def stdoutLog2json(stdout):
    """Parse concatenated '"key":"value"' messages from stdout into a list of dicts"""
    return loads('[{'+'"},{"'.join(stdout.read().split('""'))+'}]')
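# Worked example (message protocol as implied by the split/join above):
#   input:  '"description":"step 1""description":"step 2"'
#   output: [{'description': 'step 1'}, {'description': 'step 2'}]
# The '""' boundary between two messages is replaced by '"},{"' and the whole
# string is wrapped in '[{...}]' before json parsing.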
class Log(object):
"""A log file"""
def __init__(self, fp):
self.fp = fp
self.data = []
self.update()
def update(self):
"""Write log data to file"""
with open(self.fp, 'w') as f:
dump(self.data, f, indent=2)
self.secure(self.fp+'.safe')
def secure(self, fp=None):
    """Remove absolute paths from the log"""
    if fp is None:
        fp = self.fp
    path_to_hide = os.path.dirname(self.fp)
    with open(self.fp, 'r') as f:
        content = f.read()
    with open(fp, 'w') as f:
        f.write(content.replace(path_to_hide, ''))
def wrapup(self):
    """Update pending and running statuses to failed and update"""
    with open(self.fp, 'r') as f:
        content = f.read()
    with open(self.fp, 'w') as f:
        f.write(content.replace('running', 'failed').replace('pending', 'failed'))
    self.secure(self.fp+'.safe')
class Archive(object):
"""An archive"""
def __init__(self, target_fp='', name=None):
self.fp = target_fp
self.source = []
self.add(name)
def add(self, name=None, update=False, generate=True):
"""Add files to the archive"""
if name is not None:
if isinstance(name, list):
self.source = list(set(self.source + name))
else:
self.source = list(set(self.source + [name]))
if update and generate:
    self.update()
elif generate:
    # Rebuild the archive from every registered source path
    # (name alone may be a list, which tarfile.add cannot take directly)
    with tarfile.open(self.fp, "w:gz") as archive:
        for e in self.source:
            archive.add(e)
def update(self):
"""Update files in archive"""
if os.path.exists(self.fp):
tmp = NamedTemporaryFile(delete=False, dir=os.path.dirname(self.fp))
os.rename(self.fp, tmp.name)
with tarfile.open(self.fp, "w:gz") as archive:
for e in self.source:
archive.add(e)
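# Usage sketch (hypothetical paths):
#   archive = Archive('results/output.tar.gz', name='results/log.json')
#   archive.add('results/figures.tar', update=True) # rebuilds with both entries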
class Map(object):
"""Mapping file"""
def __init__(self, fp, output):
self.fp = output+'/map.txt'
copyfile(fp, self.fp)
table2R(self.fp)
self.log = []
self.log.append({'name':'file', 'type':'file', 'path':self.fp})
self.log.append({'name':'validation', 'type':'validation', 'status':'pending', 'messages':[]})
def validate(self):
log = [e for e in self.log if e['name'] == 'validation'][0]
log['status'] = 'running'
start_time = datetime.now()
log['start_time'] = str(start_time)
sub = Popen([main_dir+'/bin/validate_format.R',
'-m',
'validate_map',
'--map',
self.fp],
stdout=PIPE,
stderr=PIPE)
sub.wait()
log['messages'] = stdoutLog2json(sub.stdout)
if len([e for e in log['messages'] if 'error' in e.keys()]) > 0:
log['status'] = 'failed'
Error(msg='Error detected in '+self.fp)
log['status'] = 'completed'
log['execution_time'] = str(datetime.now() - start_time)
class DataSet(object):
"""Dataset"""
_pipeline = [{'name':'pre-processing', 'fun':'preProcess', 'operations':[
'filtering',
'binning'
]},
{'name':'transformation', 'fun':'transform', 'operations':[
'rarefaction',
'transformation',
'batch_effect_suppression'
]}
]
def __init__(self,
json_fp = '',
pipeline = [],
data_fp = '',
output = '',
map = '',
category_column = '',
abundance_threshold = '',
abundance_threshold_type = '',
presence_threshold = '',
presence_threshold_type = '',
bin_levels = [],
bin_fun = '',
rarefy = False,
rarefaction_depth = '',
nsampling = '',
transformation = '',
batch_effect = '',
batch_effect_fun = ''):
self.json_fp = json_fp
self.pipeline = self._pipeline
if pipeline != []:
self.pipeline = pipeline
self.output = output
self.data_fp = [self.output+'/'+os.path.basename(data_fp)]
# Create the temporary directory beside the output directory
# (mkdtemp creates the directory and returns its path)
self.tmpdir = mkdtemp(dir=os.path.dirname(os.path.normpath(self.output)))
copyfile(data_fp, self.data_fp[0])
table2R(self.data_fp[0])
self.map = map
self.category_column = category_column
self.abundance_threshold = abundance_threshold
self.abundance_threshold_type = abundance_threshold_type
self.presence_threshold = presence_threshold
self.presence_threshold_type = presence_threshold_type
self.bin_levels = bin_levels
self.bin_fun = bin_fun
self.rarefy = rarefy
self.rarefaction_depth = rarefaction_depth
self.nsampling = nsampling
self.transformation = transformation
self.batch_effect = batch_effect
self.batch_effect_fun = batch_effect_fun
self.log = []
self.log.append({'name':'file', 'type':'file', 'path':self.data_fp[0]})
self.log.append({'name':'validation', 'type':'validation', 'status':'pending', 'messages':[]})
self.log.append({'name':'sorting', 'type':'operation', 'status':'pending', 'messages':[]})
for p in self.pipeline:
op = []
for e in p['operations']:
op.append({'name':e, 'status':'pending', 'messages':[]})
self.log.append({'name':p['name'], 'type':'step', 'operations':op})
op = []
self.stdout = []
self.stderr = []
def validate(self):
log = [e for e in self.log if e['name'] == 'validation'][0]
log['status'] = 'running'
start_time = datetime.now()
log['start_time'] = str(start_time)
for fp in self.data_fp:
sub = Popen([main_dir+'/bin/validate_format.R',
'-t',
fp,
'-m',
'validate_dataset',
'--map',
self.map.fp,
'--category_column',
str(self.category_column)],
stdout=PIPE,
stderr=PIPE)
sub.wait()
log['messages'].extend(stdoutLog2json(sub.stdout))
log['status'] = 'completed'
log['execution_time'] = str(datetime.now() - start_time)
def sort(self):
log = [e for e in self.log if e['name'] == 'sorting'][0]
log['status'] = 'running'
start_time = datetime.now()
log['start_time'] = str(start_time)
tmp = NamedTemporaryFile(delete=False, dir=self.tmpdir)
for fp in self.data_fp:
sub = Popen([main_dir+'/bin/modify_table.R',
'-t',
fp,
'-m',
'sorting',
'--map',
self.map.fp,
'--ignore',
str(self.category_column),
'-o',
fp,
'--log',
str(tmp.name)],
stdout=PIPE,
stderr=PIPE)
sub.wait()
self.appendStdout(sub.stdout.read())
self.appendStderr(sub.stderr.read())
if os.path.exists(tmp.name):
with open(tmp.name, 'r') as t:
log['messages'].extend(stdoutLog2json(t))
os.remove(tmp.name)
tmp.close()
log['status'] = 'completed'
log['execution_time'] = str(datetime.now() - start_time)
def preProcess(self, fun=''):
log = [e for e in [o for o in self.log if o['name'] == 'pre-processing'][0]['operations'] if e['name'] == fun][0]
log['status'] = 'running'
start_time = datetime.now()
log['start_time'] = str(start_time)
tmp = NamedTemporaryFile(delete=False, dir=self.tmpdir)
args = [main_dir+'/bin/modify_table.R',
'-m',
fun,
'--log',
str(tmp.name)]
if fun == 'filtering':
if (self.abundance_threshold is None or self.abundance_threshold == 0) and (self.presence_threshold is None or self.presence_threshold == 0):
log['status'] = 'skipped'
return(0)
args.extend(['--column',
str(self.category_column),
'--abundance_threshold',
self.abundance_threshold,
'--abundance_threshold_type',
self.abundance_threshold_type,
'--presence_threshold',
self.presence_threshold,
'--presence_threshold_type',
self.presence_threshold_type])
elif fun == 'binning':
if self.bin_levels is None:
log['status'] = 'skipped'
return(0)
args.extend(['--column',
str(self.category_column),
'--fun',
self.bin_fun])
output_fps = []
for fp in self.data_fp:
args.extend(['-t', fp])
if fun == 'filtering':
output_fp = fp +'_filtered.txt'
output_fps.append(output_fp)
args.extend(['-o', output_fp])
sub = Popen(args, stdout=PIPE, stderr=PIPE)
sub.wait()
self.appendStdout(sub.stdout.read())
self.appendStderr(sub.stderr.read())
elif fun == 'binning':
vstyle = 2 if len(self.bin_levels) > 1 else 1
for level in self.bin_levels:
output_fp = fp+'_lvl'+level+'.txt'
output_fps.append(output_fp)
args.extend(['--level', level,
'--vstyle', str(vstyle),
'-o', output_fp])
sub = Popen(args, stdout=PIPE, stderr=PIPE)
sub.wait()
self.appendStdout(sub.stdout.read())
self.appendStderr(sub.stderr.read())
vstyle = 3
if os.path.exists(tmp.name):
with open(tmp.name, 'r') as t:
log['messages'].extend(stdoutLog2json(t))
os.remove(tmp.name)
tmp.close()
log['status'] = 'completed'
log['execution_time'] = str(datetime.now() - start_time)
self.data_fp = output_fps
def transform(self, fun=''):
log = [e for e in [o for o in self.log if o['name'] == 'transformation'][0]['operations'] if e['name'] == fun][0]
log['status'] = 'running'
start_time = datetime.now()
log['start_time'] = str(start_time)
tmp = NamedTemporaryFile(delete=False, dir=self.tmpdir)
args = [main_dir+'/bin/modify_table.R',
'--log',
str(tmp.name),
'--ignore',
str(self.category_column)]
if fun == 'rarefaction':
if self.rarefy is None or self.rarefy == False:
log['status'] = 'skipped'
return(0)
args.extend(['-m',
'rarefaction',
'--sample',
self.rarefaction_depth,
'--nsampling',
self.nsampling])
elif fun == 'transformation':
if self.transformation is None or self.transformation in ['none', '', ' ']:
log['status'] = 'skipped'
return(0)
args.extend(['-m', self.transformation])
elif fun == 'batch_effect_suppression':
if self.batch_effect is None:
log['status'] = 'skipped'
return(0)
args.extend(['-m',
'batch_effect_suppression',
'--map',
self.map.fp,
'--effect',
self.batch_effect,
'--fun',
self.batch_effect_fun])
suffix = {'rarefaction':'rar',
'transformation':'trans',
'batch_effect_suppression':'bes'}
output_fps = []
vstyle = 1
for fp in self.data_fp:
output_fp = '.'.join(fp.split('.')[:-1])+'_'+suffix[fun]+'.txt'
output_fps.append(output_fp)
args.extend(['-t', fp,
'--vstyle', str(vstyle),
'-o', output_fp])
sub = Popen(args, stdout=PIPE, stderr=PIPE)
sub.wait()
self.appendStdout(sub.stdout.read())
self.appendStderr(sub.stderr.read())
vstyle = 2
if os.path.exists(tmp.name):
with open(tmp.name, 'r') as t:
log['messages'].extend(stdoutLog2json(t))
os.remove(tmp.name)
tmp.close()
log['status'] = 'completed'
log['execution_time'] = str(datetime.now() - start_time)
self.data_fp = output_fps
def appendStdout(self, stdout):
if stdout is not None and stdout != '':
self.stdout.append(stdout)
def appendStderr(self, stderr):
if stderr is not None and stderr != '':
self.stderr.append(stderr)
class Analysis(object):
"""THE ANALYSIS"""
def __init__(self, json_fp):
"""initialize the analysis pipeline
json_fp File path to json
"""
# INITIALIZE THE PIPELINE
self.pipeline = [{'name':'diversity', 'before_step':'transformation', 'status':'pending', 'messages':[]},
{'name':'clustering', 'before_step':'analysis', 'status':'pending', 'messages':[]},
{'name':'proportions', 'status':'pending', 'messages':[]},
{'name':'adonis', 'status':'pending', 'messages':[]},
{'name':'pca', 'status':'pending', 'messages':[]},
{'name':'ca', 'status':'pending', 'messages':[]},
{'name':'pcoa', 'status':'pending', 'messages':[]},
{'name':'cca', 'status':'pending', 'messages':[]},
{'name':'heatmap', 'status':'pending', 'messages':[]},
{'name':'change', 'status':'pending', 'messages':[]},
{'name':'correlation_network', 'status':'pending', 'messages':[]},
{'name':'similarity_network', 'status':'pending', 'messages':[]}]
# Set parameters from json
self.parameters = Parameters(json_fp)
# Make the output directory
self.output = self.parameters.params['output']
if os.path.exists(self.output):
rmtree(self.output)
mkdir(self.output)
# Create the temporary directory beside the output directory
# (mkdtemp creates the directory and returns its path)
self.tmpdir = mkdtemp(dir=os.path.dirname(os.path.normpath(self.output)))
self.to_rm = [self.tmpdir]
# Set the archive
#self.archive = Archive(self.output+'/output.tar.gz')
# Store parameters to output directory
self.parameters.write(self.output+'/params.json')
# Start logging
self.log = Log(self.output+'/log.json')
self.stdout = Log(self.output+'/stdout.log')
self.stderr = Log(self.output+'/stderr.log')
def run(self):
"""Run the analysis"""
# Create data sets
self.map = Map(self.parameters.params['map'],
self.output)
self.log.data.append({'name':'map', 'type':'map', 'log':self.map.log})
self.log.update()
#self.archive.add(self.log.fp+'.safe', update=False, generate=False)
log = [e for e in self.map.log if e['name'] == 'validation'][0]
log['status'] = 'running'
self.log.update()
self.map.validate()
if len([e for e in log['messages'] if 'error' in e.keys()]) > 0:
log['status'] = 'failed'
self.log.update()
Error(msg='Error detected in '+self.map.fp, to_rm=self.to_rm)
self.log.update()
if self.parameters.params['bin_levels'] is None or isinstance(self.parameters.params['bin_levels'], list):
bin_levels = self.parameters.params['bin_levels']
else:
bin_levels = self.parameters.params['bin_levels'].split(',')
self.primary_dataset = DataSet(json_fp = self.parameters.json_fp,
data_fp = self.parameters.params['primary_dataset'],
output = self.output,
map = self.map,
category_column = self.parameters.params['category_column'],
abundance_threshold = self.parameters.params['abundance_threshold'],
abundance_threshold_type = self.parameters.params['abundance_threshold_type'],
presence_threshold = self.parameters.params['presence_threshold'],
presence_threshold_type = self.parameters.params['presence_threshold_type'],
bin_levels = bin_levels,
bin_fun = self.parameters.params['bin_fun'],
rarefy = bool(self.parameters.params['prim_rarefaction']),
rarefaction_depth = self.parameters.params['prim_sampling_depth'],
nsampling = self.parameters.params['prim_nsampling'],
transformation = self.parameters.params['prim_trans_method'],
batch_effect = self.parameters.params['prim_batch_effect_suppression'],
batch_effect_fun = self.parameters.params['prim_batch_effect_suppression_fun'])
self.log.data.append({'name':'primary_dataset', 'type':'dataset', 'log':self.primary_dataset.log})
self.log.update()
self.stdout.data.append({'primary_dataset':self.primary_dataset.stdout})
self.stderr.data.append({'primary_dataset':self.primary_dataset.stderr})
self.to_rm.append(self.primary_dataset.tmpdir)
#self.archive.add(self.primary_dataset.data_fp, update=False, generate=False)
log = [e for e in self.primary_dataset.log if e['name'] == 'validation'][0]
log['status'] = 'running'
self.log.update()
self.primary_dataset.validate()
if len([e for e in log['messages'] if 'error' in e.keys()]) > 0:
log['status'] = 'failed'
self.log.update()
Error(msg='Error detected in primary_dataset.validate()', to_rm=self.to_rm)
self.log.update()
log = [e for e in self.primary_dataset.log if e['name'] == 'sorting'][0]
log['status'] = 'running'
self.log.update()
self.primary_dataset.sort()
self.stdout.update()
self.stderr.update()
if len([e for e in log['messages'] if 'error' in e.keys()]) > 0:
log['status'] = 'failed'
self.log.update()
Error(msg='See '+self.log.fp+'.', to_rm=self.to_rm)
self.log.update()
if self.parameters.params['secondary_dataset'] is not None:
self.secondary_dataset = DataSet(json_fp = self.parameters.json_fp,
pipeline = [{'name':'transformation',
'fun':'transform',
'operations':['rarefaction',
'transformation',
'batch_effect_suppression']}],
data_fp = self.parameters.params['secondary_dataset'],
output = self.output,
map = self.map,
category_column = '',
rarefy = bool(self.parameters.params['sec_rarefaction']),
rarefaction_depth = self.parameters.params['sec_sampling_depth'],
nsampling = self.parameters.params['sec_nsampling'],
transformation = self.parameters.params['sec_trans_method'],
batch_effect = self.parameters.params['sec_batch_effect_suppression'],
batch_effect_fun = self.parameters.params['prim_batch_effect_suppression_fun'])
self.log.data.append({'name':'secondary_dataset', 'type':'dataset', 'log':self.secondary_dataset.log})
self.log.update()
self.stdout.data.append({'secondary_dataset':self.secondary_dataset.stdout})
self.stderr.data.append({'secondary_dataset':self.secondary_dataset.stderr})
self.to_rm.append(self.secondary_dataset.tmpdir)
log = [e for e in self.secondary_dataset.log if e['name'] == 'validation'][0]
log['status'] = 'running'
self.log.update()
self.secondary_dataset.validate()
if len([e for e in log['messages'] if 'error' in e.keys()]) > 0:
log['status'] = 'failed'
self.log.update()
Error(msg='Error detected in secondary_dataset.validate()', to_rm=self.to_rm)
self.log.update()
log = [e for e in self.secondary_dataset.log if e['name'] == 'sorting'][0]
log['status'] = 'running'
self.log.update()
self.secondary_dataset.sort()
self.stdout.update()
self.stderr.update()
if len([e for e in log['messages'] if 'error' in e.keys()]) > 0:
log['status'] = 'failed'
self.log.update()
Error(msg='See '+self.log.fp+'.', to_rm=self.to_rm)
self.log.update()
# RUN DATASET PIPELINE
self.log.data.append({'name':'analysis', 'type':'analysis', 'log':self.pipeline})
self.stdout.data.append({'analysis':[]})
self.stderr.data.append({'analysis':[]})
datasets = [self.primary_dataset]
if self.parameters.params['secondary_dataset'] is not None:
datasets.append(self.secondary_dataset)
for dataset in datasets:
for step in dataset.pipeline:
if dataset == datasets[0]:
    analyses = [e for e in self.pipeline if e.get('before_step') == step['name']]
    for analysis in analyses:
        self.analysis(analysis['name'])
for operation in step['operations']:
log = [o for o in [ s for s in dataset.log if s['name'] == step['name']][0]['operations'] if o['name'] == operation][0]
log['status'] = 'running'
self.log.update()
getattr(dataset, step['fun'])(operation)
self.stdout.update()
self.stderr.update()
if len([e for e in log['messages'] if 'error' in e.keys()]) > 0:
log['status'] = 'failed'
self.log.update()
Error(msg='See '+self.log.fp+'.', to_rm=self.to_rm)
self.log.update()
# COMPLETE ANALYSIS PIPELINE
analyses = [e for e in self.pipeline if e.get('before_step') == 'analysis']
for analysis in analyses:
    self.analysis(analysis['name'])
analyses = [e for e in self.pipeline if 'before_step' not in e]
for analysis in analyses:
    self.analysis(analysis['name'])
self.cleanup()
def cleanup(self):
self.log.wrapup()
self.stdout.update()
self.stderr.update()
#self.archive.update()
for t in self.to_rm:
rmtree(t)
def analysis(self, method=''):
"""Run R scripts to generate figures and stats"""
log = [e for e in [l for l in self.log.data if l['name'] == 'analysis'][0]['log'] if e['name'] == method][0]
stdout = [ e for e in self.stdout.data if e.keys()[0] == 'analysis'][0]['analysis']
stderr = [ e for e in self.stderr.data if e.keys()[0] == 'analysis'][0]['analysis']
if method not in self.parameters.params.keys() or self.parameters.params[method] == 0:
log['status'] = 'skipped'
self.log.update()
return(0)
log['status'] = 'running'
start_time = datetime.now()
log['start_time'] = str(start_time)
self.log.update()
append_to_map = False
output = self.output+'/'+method
mkdir(output)
tmp = NamedTemporaryFile(delete=False, dir=self.tmpdir)
# Set general arguments
args = [main_dir+'/bin/analyse_table.R',
'--graphical',
'TRUE',
'--log',
str(tmp.name),
'--method',
method,
'--map',
self.map.fp,
'--category',
self.primary_dataset.category_column]
# Set analysis-specific arguments
if method == 'diversity':
if self.parameters.params['diversity_metric'] is not None:
args.extend(['--fun',
','.join(self.parameters.params['diversity_metric'])])
if self.parameters.params['compare_diversity'] is not None:
args.extend(['--compare_diversity',
self.parameters.params['compare_diversity']])
elif method == 'clustering':
append_to_map = True
if self.parameters.params['clustering_fun'] is not None:
clustering_fun = self.parameters.params['clustering_fun']
if isinstance(clustering_fun, list): # NOT SUPPORTED BY R SCRIPT YET
clustering_fun = ','.join(clustering_fun)
args.extend(['--fun',
clustering_fun])
elif method == 'adonis':
if self.parameters.params['adonis_model'] is not None:
adonis_model = self.parameters.params['adonis_model']
if isinstance(adonis_model, list):
adonis_model = ','.join(adonis_model)
args.extend(['--adonis_model',
adonis_model])
if self.parameters.params['adonis_distfun'] is not None:
adonis_distfun = self.parameters.params['adonis_distfun']
if isinstance(adonis_distfun, list):
adonis_distfun = ','.join(adonis_distfun)
args.extend(['--fun',
adonis_distfun])
if self.parameters.params['adonis_strata'] is not None:
adonis_strata = self.parameters.params['adonis_strata']
if isinstance(adonis_strata, list):
adonis_strata = ','.join(adonis_strata)
args.extend(['--strata',
adonis_strata])
elif method == 'pcoa':
pcoa_distfun = self.parameters.params['pcoa_distfun']
if pcoa_distfun is not None:
if isinstance(pcoa_distfun, list):
pcoa_distfun = ','.join(pcoa_distfun)
args.extend(['--fun',
str(pcoa_distfun)])
elif method == 'cca':
if self.parameters.params['cca_categ'] is not None:
cca_categ = self.parameters.params['cca_categ']
if isinstance(cca_categ, list):
cca_categ = ','.join(cca_categ)
args.extend(['--column',
cca_categ])
else:
log['messages'].append({'warning':'No categorical data was given in CCA. Please provide the name(s) of column(s) in map that contain(s) categorical data to be used in CCA.'})
log['status'] = 'skipped'
self.log.update()
return(0)
elif method == 'heatmap' and self.parameters.params['secondary_dataset'] is not None:
args.extend(['--secondary',
self.secondary_dataset.data_fp[0]])
elif method == 'correlation_network':
if self.parameters.params['correlation_network_fun'] is not None:
correlation_network_fun = self.parameters.params['correlation_network_fun']
if isinstance(correlation_network_fun, list):
correlation_network_fun = ','.join(correlation_network_fun)
args.extend(['--fun',
correlation_network_fun])
if self.parameters.params['secondary_dataset'] is not None:
args.extend(['--secondary',
self.secondary_dataset.data_fp[0]])
elif method == 'similarity_network':
append_to_map = True
if self.parameters.params['similarity_network_fun1'] is not None:
fun = self.parameters.params['similarity_network_fun1']
if self.parameters.params['secondary_dataset'] is not None and self.parameters.params['similarity_network_fun2'] is not None:
fun = fun+','+self.parameters.params['similarity_network_fun2']
args.extend(['--fun', fun])
if self.parameters.params['similarity_network_clust'] is not None:
args.extend(['--clust',
self.parameters.params['similarity_network_clust']])
if self.parameters.params['secondary_dataset'] is not None:
args.extend(['--secondary',
self.secondary_dataset.data_fp[0]])
if len(self.primary_dataset.data_fp) > 1:
if len(self.primary_dataset.data_fp) != len(self.parameters.params['bin_levels']):
Error(msg='Analysis cannot be performed at specified binning levels because corresponding data files were not found.',
to_rm=self.to_rm)
log['levels'] = log.pop('messages')
for i in range(0, len(self.parameters.params['bin_levels'])):
log['levels'].append({'level':self.parameters.params['bin_levels'][i], 'status':'pending', 'messages':[]})
for i in range(0, len(self.primary_dataset.data_fp)):
l = log
output_fp = output+'/'+method
if len(self.primary_dataset.data_fp) > 1:
l = [e for e in log['levels'] if e['level'] == self.parameters.params['bin_levels'][i]][0]
output_fp = output_fp+'_level_'+self.parameters.params['bin_levels'][i]
l['status'] = 'running'
args_cp = copy(args)
args_cp.extend(['-t',
self.primary_dataset.data_fp[i],
'-o',
output_fp])
if self.parameters.params['model_type'] == 'basic':
if isinstance(self.parameters.params['basic_model'], list):
m = ','.join(self.parameters.params['basic_model'])
else:
m = self.parameters.params['basic_model']
args_cp.extend(['--model',
m])
else:
if isinstance(self.parameters.params['advanced_model'], list):
m = ','.join(self.parameters.params['advanced_model'])
else:
m = self.parameters.params['advanced_model']
+
+ if isinstance(self.parameters.params['advanced_stats'], list):
+ s = ','.join(self.parameters.params['advanced_stats'])
+ else:
+ s = self.parameters.params['advanced_stats']
+
args_cp.extend(['--stats',
- self.parameters.params['advanced_stats'],
+ s,
'--model',
m])
sub = Popen(args_cp, stdout=PIPE, stderr=PIPE)
sub.wait()
s = sub.stdout.read()
if s is not None and s != '':
stdout.append(s)
s = sub.stderr.read()
if s is not None and s != '':
stderr.append(s)
if os.path.exists(tmp.name):
with open(tmp.name, 'r') as t:
l['messages'].extend(stdoutLog2json(t))
os.remove(tmp.name)
tmp.close()
self.log.update()
if append_to_map:
if os.path.exists(output_fp+'.txt') and os.stat(output_fp+'.txt').st_size != 1:
tmpmap = NamedTemporaryFile(delete=False, dir=self.tmpdir)
with open(tmpmap.name, 'w') as new, open(self.map.fp) as m, open(output_fp+'.txt') as c:
i = 0
for line_m, line_c in zip(m, c):
to_add = line_c.rstrip().split('\t')
to_add.pop(0)
if i == 0:
    # to_add is a list of column names; rename each one that collides with
    # an existing map column (the original compared the whole list at once)
    header = line_m.rstrip().split('\t')
    for n in range(len(to_add)):
        while to_add[n] in header:
            to_add[n] = to_add[n]+'_'+rand(3)
    model_to_add = to_add
new.write('%s\t%s\n' % (line_m.rstrip(), '\t'.join(to_add)))
i = i+1
move(tmpmap.name, self.map.fp)
if self.parameters.params['model_type'] == 'basic':
if not isinstance(self.parameters.params['basic_model'], list):
self.parameters.params['basic_model'] = [self.parameters.params['basic_model']]
self.parameters.params['basic_model'].extend(model_to_add)
else:
- if not isinstance(self.parameters.params['basic_model'], list):
+ if not isinstance(self.parameters.params['advanced_model'], list):
self.parameters.params['advanced_model'] = [self.parameters.params['advanced_model']]
self.parameters.params['advanced_model'].extend(model_to_add)
+ if not isinstance(self.parameters.params['advanced_stats'], list):
+ self.parameters.params['advanced_stats'] = [self.parameters.params['advanced_stats']]
+ self.parameters.params['advanced_stats'].extend(['anova'])
else:
Warning(msg='In '+method+': output file not found ('+output_fp+'.txt'+').')
if len([e for e in l['messages'] if 'error' in e.keys()]) > 0:
l['status'] = 'failed'
log['execution_time'] = str(datetime.now() - start_time)
log['status'] = 'failed'
self.log.update()
Error(msg='See '+self.log.fp+'.', to_rm=self.to_rm)
else:
l['status'] = 'completed'
self.log.update()
log['execution_time'] = str(datetime.now() - start_time)
log['status'] = 'completed'
self.log.update()
diff --git a/lib/genocrunch_console/lib/genocrunchlib.pyc b/lib/genocrunch_console/lib/genocrunchlib.pyc
index a38c3c0..06ea672 100644
Binary files a/lib/genocrunch_console/lib/genocrunchlib.pyc and b/lib/genocrunch_console/lib/genocrunchlib.pyc differ