# # Python course 2021 - Exercises F
#
# ## Part 1 - DNA, RNA and peptide sequences

# ---
# 1.1) Write a function to get the reverse complement (upper case letters)
#      of a DNA sequence given in upper case letters!

def get_reverse_complement(sequence):
    """Return the reverse complement of a DNA sequence in upper case letters.

    Args:
        sequence: DNA sequence consisting of the bases A, C, G and T. The
            ambiguity code N and lower case letters are accepted as well.

    Returns:
        The reverse complement of ``sequence`` as an upper case string.

    Raises:
        KeyError: If ``sequence`` contains a character that is not a known base.
    """
    # N stands for "any base" and therefore is its own complement.
    bases = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}

    # Walk the sequence back to front and complement every base on the way.
    return ''.join(bases[base.upper()] for base in reversed(sequence))


sequence_1 = "AGACGTA"
print(sequence_1)
print(get_reverse_complement(sequence_1))

sequence_2 = "TTTGACGTAT"
print(sequence_2)
print(get_reverse_complement(sequence_2))


# ---
# 1.2) Write a function to convert a DNA sequence into an RNA sequence!

def convert_DNA_to_RNA(sequence):
    """Return the RNA version of a DNA sequence (thymine replaced by uracil).

    Args:
        sequence: DNA sequence; the letter case of every base is preserved.

    Returns:
        The sequence with every ``T`` replaced by ``U`` (and ``t`` by ``u``).
        All other characters are copied unchanged.
    """
    replacements = {'T': 'U', 't': 'u'}
    return ''.join(replacements.get(base, base) for base in sequence)


print(sequence_1)
print(convert_DNA_to_RNA(sequence_1))

print(sequence_2)
print(convert_DNA_to_RNA(sequence_2))


# ---
# 1.3) Write a function to translate a DNA sequence into amino acids
#      (first frame only)!
# * Tip: wiki - codon tables
#   (https://en.wikipedia.org/wiki/DNA_and_RNA_codon_tables)

# DNA codon -> one-letter amino acid code; "*" marks a stop codon.
codons = {
    "TTT": "F", "TTC": "F",
    "TTA": "L", "TTG": "L", "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "ATT": "I", "ATC": "I", "ATA": "I",
    "ATG": "M",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "TAT": "Y", "TAC": "Y",
    "CAT": "H", "CAC": "H",
    "CAA": "Q", "CAG": "Q",
    "AAT": "N", "AAC": "N",
    "AAA": "K", "AAG": "K",
    "GAT": "D", "GAC": "D",
    "GAA": "E", "GAG": "E",
    "TGT": "C", "TGC": "C",
    "TGG": "W",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "AGT": "S", "AGC": "S",
    "AGA": "R", "AGG": "R",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
    "TGA": "*", "TAA": "*", "TAG": "*",
}


def translate(sequence):
    """Translate the first reading frame of a DNA sequence into amino acids.

    Args:
        sequence: DNA sequence in upper case letters (A, C, G, T).

    Returns:
        One amino acid letter per complete codon, with ``*`` for stop codons.
        Trailing bases that do not fill a complete codon are ignored.

    Raises:
        KeyError: If a codon is not contained in the ``codons`` table.
    """
    # Stopping at len - 2 guarantees that every slice below has three bases.
    codon_starts = range(0, len(sequence) - 2, 3)
    return ''.join(codons[sequence[start:start + 3]] for start in codon_starts)


sequence = "ATGCATGGTTGAGGCGGCATGCGTCGCGATTGG"
print(translate(sequence))
# ---
# 1.4) Write a function to translate DNA sequences in all 6 frames into
#      peptide sequences! The longest peptide sequence per DNA sequence
#      should be returned!

def translate_all_frames(sequence):
    """Translate a DNA sequence in all 6 reading frames.

    Args:
        sequence: DNA sequence in upper case letters.

    Yields:
        The translations of the three frames of ``sequence`` (offsets 0, 1, 2),
        followed by the translations of the three frames of its reverse
        complement.
    """
    for offset in range(3):
        yield translate(sequence[offset:])

    # The reverse strand is only computed once the forward frames are consumed.
    rev_comp = get_reverse_complement(sequence)
    for offset in range(3):
        yield translate(rev_comp[offset:])


def get_longest_peptide_sequence(sequence):
    """Find a longest valid peptide sequence in a translated reading frame.

    A valid peptide sequence starts with ``M`` and ends right before the next
    stop codon (``*``) or at the end of the frame.

    Args:
        sequence: Amino acid letters of one reading frame, ``*`` marking stops.

    Returns:
        The longest valid peptide sequence without the stop symbol. If several
        share the maximal length the first one is returned; if there is none,
        the empty string is returned.
    """
    # Every stretch between two stops can hold at most one valid peptide:
    # the part that begins at the first M of that stretch.
    candidates = (
        segment[segment.index("M"):]
        for segment in ''.join(sequence).split("*")
        if "M" in segment
    )
    # max() keeps the first of several equally long candidates.
    return max(candidates, key=len, default="")


def longest_peptide_sequence_per_frame(sequence):
    """Compute a longest valid peptide sequence for each of the 6 frames.

    Args:
        sequence: DNA sequence in upper case letters.

    Yields:
        One (possibly empty) peptide sequence per reading frame, in the order
        produced by ``translate_all_frames``.
    """
    for frame in translate_all_frames(sequence):
        yield get_longest_peptide_sequence(frame)


def get_longest_peptide_sequence_of_all_frames(sequence):
    """Return the longest valid peptide sequence over all 6 reading frames.

    Args:
        sequence: DNA sequence in upper case letters.

    Returns:
        The longest per-frame result; the earliest frame wins a tie. The empty
        string is returned if no frame contains a valid peptide sequence.
    """
    return max(longest_peptide_sequence_per_frame(sequence), key=len)


# The demo relies on `sequence`, `translate` and `get_reverse_complement` from
# the previous cells. It runs when the cell is executed in the notebook, but
# not when the functions are merely imported.
if __name__ == "__main__":
    print("Reading frames:")
    for frame in translate_all_frames(sequence):
        print(frame)
    print("")

    print("Longest valid peptide sequence:")
    print(get_longest_peptide_sequence(translate(sequence)))
    print("")

    print("Longest valid peptide sequence per frame:")
    for frame_result in longest_peptide_sequence_per_frame(sequence):
        print(frame_result)
    print("")

    print("Longest valid peptide sequence over all frames:")
    print(get_longest_peptide_sequence_of_all_frames(sequence))
# ---
# Human Dataset
#
# 1.1) Count number of unique genes from the file
#      "genes_human_genenames_duplicates.txt" and write the gene names in a
#      new file.

def write_unique_gene_names(input_path, output_path):
    """Write every gene name of ``input_path`` exactly once to ``output_path``.

    Each input line holds one gene name, optionally followed by a transcript
    number that is separated by a dot. The transcript number is dismissed
    before names are compared.

    Args:
        input_path: Text file with one (possibly duplicated) gene name per line.
        output_path: File to create; receives one unique gene name per line in
            order of first appearance.

    Returns:
        The number of unique gene names that were written.
    """
    seen_genes = set()  # set membership keeps the duplicate check cheap
    with open(input_path, 'r') as genes, open(output_path, 'w') as new_file:
        for line in genes:
            # Strip the line break first: cutting at the dot would otherwise
            # drop it and glue the names together in the output file.
            gene_name = line.strip().partition('.')[0]  # dismiss the transcript number
            if not gene_name or gene_name in seen_genes:
                continue
            seen_genes.add(gene_name)
            new_file.write(gene_name + "\n")
    return len(seen_genes)


# Colab-specific driver: mounts Google Drive and processes the course data.
# It runs when the cell is executed in the notebook, but not on import.
if __name__ == "__main__":
    from google.colab import drive
    drive.mount('/content/drive')

    number_of_unique_genes = write_unique_gene_names(
        "/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/genes_human_genenames_duplicates.txt",
        "/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/gene_names_unique_human.txt",
    )
    print("Number of unique genes:", number_of_unique_genes)