Commit 2cb8b8a1 authored by lucaparmigiani's avatar lucaparmigiani
Browse files

R script for plotting

parent f7b6eedf
......@@ -8,3 +8,4 @@
**/pangrowth
**/hist.txt
**/pangenome_growth.txt
**/plots
library(ggplot2)
args = commandArgs(trailingOnly=TRUE)
INPUT_PATH = file.path(args[1])
DIROUT = args[2]
#--------------------------------------------------------------------------------
plot_scatter <- function (item, ytitle, filename, heap, alpha, param, plotlog=F) {
param = latex2exp::TeX(param)
if (abs(alpha) <= 1) {
color_heap = "green"
} else {
color_heap = "red"
}
pl = ggplot(data = item, aes(x = x, y = y, color="Items")) +
geom_point()+
geom_line(data=heap,aes(x=x, y=y, color="Heap's law"), alpha=0.5)+
theme_classic() +
scale_color_manual(values=c(color_heap, "#000000"))+
xlab("No. of genomes") +
ylab(ytitle) +
ggtitle(param) +
theme_bw(base_size = 16) +
theme(legend.text = element_text(size=5), plot.title =
element_text(color="black", size=14))
if (plotlog) {
pl = pl + coord_trans(y='log10',x='log10')
}
#plot(pl)
ggsave(filename=file.path(DIROUT, filename), plot=pl)
}
#--------------------------------------------------------------------------------
# Load
tot_item = as.vector(unlist(read.csv(INPUT_PATH, header=F, sep=" ")))
tot_item = tot_item[!is.na(tot_item)]
new_item = vector("numeric", length(tot_item))
new_item[1] = tot_item[1]
for (i in 2:(length(tot_item))) {
new_item[i] = tot_item[i] - tot_item[i-1]
}
new_item[new_item==0]=1
N = length(tot_item)
#--------------------------------------------------------------------------------
# ALPHA
x = 2:N
model = lm(log(new_item[x])~log(x))
#str(summary(model))
#summary(model)
adj.r.sq = summary(model)$adj.r.squared
x_h = seq(2,N,length.out=1000)
y_h = exp(model$coefficients[1])*x_h^(model$coefficients[2])
alpha = abs(model$coefficients[2])
heap = data.frame(x=x_h,y=y_h)
param = paste0("$\\alpha = $", signif(alpha, digits=4))
plot_scatter(data.frame(x=2:N,y=new_item[2:N]), "No. of new items",
paste0("new_items.png"), heap, alpha, param)
plot_scatter(data.frame(x=2:N,y=new_item[2:N]), "No. of new items",
paste0("new_items_log.png"), heap, alpha, param, plotlog=T)
#--------------------------------------------------------------------------------
# GAMMA
x = 1:N
model = lm(log(tot_item[x])~log(x))
#str(summary(model))
adj.r.sq = summary(model)$adj.r.squared
gamma = model$coefficients[2]
x_h = seq(1,N,length.out=1000)
y_h = exp(model$coefficients[1])*x_h^(model$coefficients[2])
heap = data.frame(x=x_h,y=y_h)
param = paste0("$\\gamma = $", signif(gamma, digits=4))
plot_scatter(data.frame(x=1:N,y=tot_item), "No. of distinct items",
paste0("total_items.png"), heap, alpha, param)
plot_scatter(data.frame(x=1:N,y=tot_item), "No. of distinct items",
paste0("total_items_log.png"), heap, alpha, param, plotlog=T)
......@@ -2,7 +2,11 @@
## Install
```bash
### Dependencies
Pangrowth relies on [GMP](https://gmplib.org/).
```console
make
```
......@@ -10,13 +14,20 @@ make
### Produce the histogram from fasta files
```bash
```console
./pangrowth hist -k 17 -t 12 $(ls -d -1 data/fa/*) > hist.txt
```
### Produce pangenome growth path from histogram or pan-matrix
### Produce pangenome growth from histogram (or pan-matrix)
```bash
```console
./pangrowth growth -h -i data/hist_ecoli_48.txt > pangenome_growth.txt
```
### Create a plot of the pangenome growth
Plotting requires R and ggplot2 installed.
```console
Rscript R/plot_pangrowth.R pangenome_growth.txt plots
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment