diff --git a/Exercises/solutions/Python_course_2021_exercises_D.ipynb b/Exercises/solutions/Python_course_2021_exercises_D.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..8f869b6e53effaaa2c06d5aa5f7346eacd7f9cc2 --- /dev/null +++ b/Exercises/solutions/Python_course_2021_exercises_D.ipynb @@ -0,0 +1,301 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Python_course_2021_exercises_D.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "xqfYLmi0LWEl", + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Python course 2021 - Exercises D" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LRZcpmP8LaR_", + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Part1 - writing files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NDIaKYRcLfz1", + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "\n", + "\n", + "---\n", + "1.1) Read the file AtCol0_Exons.fasta and write all headers (starting with '>') into a new file!\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')\n", + "\n", + "with open(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/AtCol0_Exons.fasta\", 'r') as exons:\n", + " with open(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/Header_AtCol0_Exons.txt\", 'w') as new_file:\n", + " line = exons.readline()\n", + " while line:\n", + " if line[0] == '>':\n", + " new_file.write(line)\n", + " line = exons.readline()" + ], + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9OMVwvZLzW9A", + "outputId": "36f854a7-f0f2-4472-f3e5-081fad69ebe9" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "1.2) Read the file AtCol0_Exons.fasta and write the following:\n", + "* Line if it is a header\n", + "* Length of line if it is a sequence line\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + }, + "id": "4S2i2BAjzW9B" + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "with open(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/AtCol0_Exons.fasta\", 'r') as exons:\n", + " with open(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/Summary_AtCol0_Exons.txt\", 'w') as new_file:\n", + " line = exons.readline()\n", + " while line:\n", + " if line[0] == '>':\n", + " new_file.write(line)\n", + " else:\n", + " new_file.write(str(len(line.strip())) + \"\\n\")\n", + " line = exons.readline()" + ], + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "id": "WRHfLJuJzW9C" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "1.3) Calculate the number of sequences, the cumulative length and the average length in a new file! Are they matching the values of the original file?\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + }, + "id": "2Uy9-PtVzW9C" + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Number of sequences: 217183\n", + "Cumulative length: 64867051 bases\n", + "Average sequence length: 298.67462462531597 bases\n" + ] + } + ], + "source": [ + "def summarize_seq_info(summary_file):\n", + " with open(summary_file, 'r') as summary:\n", + " seq_count = 0\n", + " cum_len = 0\n", + " line = summary.readline()\n", + " while line:\n", + " if line[0] == '>':\n", + " seq_count += 1\n", + " else:\n", + " cum_len += int(line.strip())\n", + " line = summary.readline()\n", + " print(\"Number of sequences:\", seq_count)\n", + " print(\"Cumulative length:\", cum_len, \"bases\")\n", + " print(\"Average sequence length:\", cum_len / seq_count, \"bases\")\n", + "\n", + "summarize_seq_info(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/Summary_AtCol0_Exons.txt\")" + ], + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "V5s-M0dSzW9D", + "outputId": "212fadb1-dbbc-4929-e53e-d41426830418" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "1.4) Write sequences into a new file if their length is a multiple of 10!\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + }, + "id": "KuS83FpyzW9E" + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [], + "source": [ + "def seq_lens_multiple_of_10(fasta_file):\n", + " with open(fasta_file, 'r') as fasta_input:\n", + " with open(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/AtCol0_mult10.txt\", 'w') as out:\n", + " cum_len = 0\n", + " sequence = \"\"\n", + " line = fasta_input.readline()\n", + " while line:\n", + " if line[0] == '>': # Reading the next header\n", + " if cum_len % 10 == 0: # Check if the length is a multiple of 10\n", + " out.write(sequence + '\\n')\n", + " cum_len = 0 # Reset the sequence length and the sequence as we are in the next sequence now\n", + " sequence = \"\"\n", + " else:\n", + " sequence += line.strip() # Append the sequence to the current one as long as no other header is in between\n", + " cum_len += len(line.strip()) # Update the cumulative length for this sequence\n", + " line = fasta_input.readline()\n", + "\n", + "seq_lens_multiple_of_10(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/AtCol0_Exons.fasta\")" + ], + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "id": "m2FZHcpmzW9G" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Part2 - characters" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + }, + "id": "HAzJAcOAzW9I" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "---\n", + "2.1) Read the file AtCol0_Exons.fasta and write the following:\n", + "* Only Arabidopsis Gene Identifier (e.g. AT1G01010)\n", + "* Gene Identifier, exon name, exon length (tab-delimited)\n", + "\n", + "\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + }, + "id": "hUOOeTCUzW9I" + } + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [], + "source": [ + "import re\n", + "\n", + "def arabidopsis_only(fasta_file):\n", + " with open(fasta_file, 'r') as summary:\n", + " with open(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/Arabidopsis_Exons.txt\", 'w') as arabidopsis:\n", + " line = summary.readline()\n", + " while line:\n", + " if line.startswith('>AT'):\n", + " columns = line.split('|')\n", + " gene_identifier = columns[0].strip('>').split('.')[0]\n", + " if gene_identifier in re.findall(\"AT\\dG\\d{5}\", gene_identifier):\n", + " exon_name = columns[1].strip()\n", + " exon_length = columns[3].strip().split(' ')[2]\n", + " arabidopsis.write(gene_identifier + '\\t' + exon_name + '\\t' + exon_length + '\\n')\n", + " line = summary.readline()\n", + "\n", + "arabidopsis_only(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/AtCol0_Exons.fasta\")" + ], + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "id": "YStsUutUzW9I" + } + } + ] +} \ No newline at end of file