From bef72f985be8b3b53736fff1fa8bf063eb1f131a Mon Sep 17 00:00:00 2001 From: Franziska Niemeyer <franziska.niemeyer@uni-bielefeld.de> Date: Mon, 8 Aug 2022 09:27:28 +0200 Subject: [PATCH] Upload solutions for exercises_C --- .../Python_course_2021_exercises_C.ipynb | 338 ++++++++++++++++++ 1 file changed, 338 insertions(+) create mode 100644 Exercises/solutions/Python_course_2021_exercises_C.ipynb diff --git a/Exercises/solutions/Python_course_2021_exercises_C.ipynb b/Exercises/solutions/Python_course_2021_exercises_C.ipynb new file mode 100644 index 0000000..0ac0325 --- /dev/null +++ b/Exercises/solutions/Python_course_2021_exercises_C.ipynb @@ -0,0 +1,338 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Python_course_2021_exercises_C.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "FyvebZ68I8BJ" + }, + "source": [ + "# Python course 2021 - Exercises C" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8PgJ1ymVJCIO" + }, + "source": [ + "## Part1 - file handling" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ws7tJiEXJG8f" + }, + "source": [ + "\n", + "\n", + "---\n", + "1.1) Count number of sequences (number of headers) in AtCol0_Exons.fasta!\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "JzgmMxR0JVxL", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "52d24ee3-445c-4b78-9b91-52de52a9791d" + }, + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "oEJidIAEy8S5" + }, + "source": [ + "datei = open(\"/content/drive/MyDrive/PythonProgramming/AtCol0_Exons.fasta\", 
def get_num_headers(lines):
    """Count the FASTA header lines in *lines*.

    Args:
        lines: Iterable of text lines, e.g. the list returned by
            ``readlines()``. Trailing newlines may be present.

    Returns:
        The number of lines whose first character is ``>``.
    """
    # str.startswith is safe on empty strings, so no separate emptiness
    # check is needed before inspecting the first character.
    return sum(1 for line in lines if line.startswith(">"))


def get_num_sequence_lines(lines):
    """Count the sequence lines in *lines*.

    A sequence line is any line that is not a header (does not start with
    ``>``) and is not blank.

    Args:
        lines: Iterable of text lines, e.g. the list returned by
            ``readlines()``. Trailing newlines may be present.

    Returns:
        The number of non-blank, non-header lines.
    """
    # Lines read from a file keep their trailing newline, so a blank line is
    # still a truthy string; it has to be stripped before testing whether it
    # carries any content.
    return sum(
        1 for line in lines
        if line.strip() and not line.startswith(">")
    )
def get_num_characters(lines):
    """Count all characters in *lines* and the average number per line.

    Characters are counted as-is, i.e. including any trailing newline that
    each line carries.

    Args:
        lines: Iterable of text lines.

    Returns:
        A tuple ``(num_characters, average_per_line)``. For empty input the
        result is ``(0, 0.0)`` instead of a division by zero.
    """
    num_characters = 0
    num_lines = 0
    for line in lines:
        num_characters += len(line)
        num_lines += 1
    if num_lines == 0:
        return (0, 0.0)
    return (num_characters, num_characters / num_lines)


def get_sequence_length(lines):
    """Return the combined length of all sequence lines in *lines*.

    Header lines (starting with ``>``) are skipped; every other line is
    stripped of surrounding whitespace before its length is added.

    Args:
        lines: Iterable of text lines.

    Returns:
        The total number of sequence characters.
    """
    return sum(len(line.strip()) for line in lines if not line.startswith(">"))


def get_average_sequence_length(lines):
    """Return the average sequence length per FASTA entry.

    The average is the combined length of all sequence lines divided by the
    number of header lines.

    Args:
        lines: Iterable of text lines. It is traversed only once, so a
            one-shot iterator (such as an open file) works as well.

    Returns:
        The average sequence length as a float, or ``0.0`` when *lines*
        contains no header line (there is no entry to average over).
    """
    num_headers = 0
    total_sequence_length = 0
    # Headers and sequence characters are tallied in the same pass so the
    # input does not have to be iterated twice.
    for line in lines:
        if line.startswith(">"):
            num_headers += 1
        else:
            total_sequence_length += len(line.strip())
    if num_headers == 0:
        return 0.0
    return total_sequence_length / num_headers
def read_fasta(file):
    """Parse FASTA content entry-wise.

    An entry consists of a header line (starting with ``>``) and all
    following lines up to the next header. The sequence lines of an entry
    are concatenated with every whitespace character removed.

    Args:
        file: Iterable of text lines, e.g. an open file object or a list of
            strings.

    Returns:
        A list of ``(header, sequence)`` tuples in input order. The header
        is the stripped header line including the leading ``>``. Lines that
        appear before the first header are ignored.
    """
    # Imported locally so the function does not depend on a module-level
    # import being present.
    from string import whitespace

    # The translation table is loop-invariant, so build it once.
    strip_whitespace = str.maketrans('', '', whitespace)

    result = []
    header = None
    sequence = []
    for line in file:
        # Remove all whitespace from the ends.
        line = line.strip()
        if line.startswith('>'):
            # A new header closes the previous entry (if there is one):
            # store it with its sequence lines joined together.
            if header is not None:
                result.append((header, ''.join(sequence)))
            header = line
            sequence = []
        else:
            # Not a header: collect the line for the current entry after
            # removing any whitespace inside it.
            sequence.append(line.translate(strip_whitespace))

    # Store the last entry, which is not followed by another header.
    if header is not None:
        result.append((header, ''.join(sequence)))
    return result