Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
AGGI Projects
corer
Commits
45c322e2
Commit
45c322e2
authored
Jan 05, 2022
by
Tizian Schulz
Browse files
Merge branch 'dev' into 'master'
Added a preprocessing step for BUSCO analysis See merge request
!5
parents
cdb502d0
9ba9e0a9
Changes
2
Hide whitespace changes
Inline
Side-by-side
experiments/Snakefile
View file @
45c322e2
...
...
@@ -102,10 +102,18 @@ rule all:
"arabidopsis/coreSizes/coreSize_readGraph_k21_m17.txt",
"arabidopsis/coreSizes/coreSize_readGraph_k21_Core_m17_d60_m17.txt"
rule getProteinSeqs:
input:
"arabidopsis/annotations/{acc}.gff"
output:
"arabidopsis/proteinSequences/{acc}.fasta"
shell:
"python3 scripts/extractProteinSeqs.py {input} {output}"
rule runBUSCOanalysis:
input:
brasDB = config['BUSCObrasDB'],
genes = "arabidopsis/
g
enes/{acc}.fasta"
genes = "arabidopsis/
proteinSequ
en
c
es/{acc}.fasta"
output:
"arabidopsis/buscoAnalysis/run_BUSCO_{acc}/full_table_BUSCO_{acc}.tsv"
shell:
...
...
experiments/scripts/extractProteinSeqs.py
0 → 100644
View file @
45c322e2
#!/usr/bin/env python3
import
sys
#This script takes an annotation file of augustus and outputs a multiple fasta file containing amino acid sequences of all annotations
from
os.path
import
basename
from
Bio.SeqRecord
import
SeqRecord
from
Bio
import
SeqIO
from
Bio.Seq
import
Seq
gname
=
""
counter
=
0
acc
=
basename
(
sys
.
argv
[
1
]).
split
(
".gff"
)[
0
]
isProtSeq
=
False
records
=
[]
for
l
in
open
(
sys
.
argv
[
1
],
'r'
):
l
=
l
.
strip
()
if
not
l
.
startswith
(
'#'
):
elems
=
l
.
split
(
'
\t
'
)
if
elems
[
2
]
==
"gene"
:
gname
=
elems
[
0
]
counter
+=
1
if
l
.
startswith
(
"# protein sequence = ["
):
isProtSeq
=
True
seq
=
l
.
split
(
'['
)[
1
].
split
(
']'
)[
0
]
if
l
.
find
(
']'
)
>
-
1
:
records
.
append
(
SeqRecord
(
Seq
(
seq
),
id
=
acc
+
gname
+
'_'
+
str
(
counter
)))
isProtSeq
=
False
continue
if
isProtSeq
:
seq
+=
l
.
split
(
' '
)[
1
].
split
(
']'
)[
0
]
if
l
.
find
(
']'
)
>
-
1
:
records
.
append
(
SeqRecord
(
Seq
(
seq
),
id
=
acc
+
gname
+
'_'
+
str
(
counter
)))
isProtSeq
=
False
SeqIO
.
write
(
records
,
sys
.
argv
[
1
].
split
(
".gff"
)[
0
]
+
".fasta"
,
"fasta"
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment