Upload solutions for exercises_F

92ea6476 · Franziska Niemeyer · 946d9cf4 · 92ea6476
Commit 92ea6476 authored 2 years ago by Franziska Niemeyer
--- a/Exercises/solutions/Python_course_2021_exercises_F.ipynb
+++ b/Exercises/solutions/Python_course_2021_exercises_F.ipynb
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+  "colab": {
+   "name": "Python_course_2021_exercises_F.ipynb",
+   "provenance": [],
+   "collapsed_sections": []
+  },
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3"
+  }
+ },
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "C6hjQZrrO1lx",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "# Python course 2021 - Exercises F"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RoMsf4tCO3CA",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "## Part 1 - DNA, RNA and peptide sequences"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "3_wMYLkSPBrx",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "\n",
+    "\n",
+    "---\n",
+    "1.1) Write a function to get the reverse complement (upper case letters) of a DNA sequence given in upper case letters!\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "Uq70MpOBPMpe",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "012a7aed-2e4e-4ac4-d32c-2b1b4e8c3784",
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "source": [
+    "def get_reverse_complement(sequence):\n",
+    "  bases = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}\n",
+    "\n",
+    "  rev_comp = []\n",
+    "  for i in range(len(sequence)-1, -1, -1):\n",
+    "    rev_comp += [bases[sequence[i]]]\n",
+    "\n",
+    "  return ''.join(rev_comp)\n",
+    "\n",
+    "sequence_1 = \"AGACGTA\"\n",
+    "print(sequence_1)\n",
+    "print(get_reverse_complement(sequence_1))\n",
+    "\n",
+    "sequence_2 = \"TTTGACGTAT\"\n",
+    "print(sequence_2)\n",
+    "print(get_reverse_complement(sequence_2))"
+   ],
+   "execution_count": 1,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "AGACGTA\n",
+      "TACGTCT\n",
+      "TTTGACGTAT\n",
+      "ATACGTCAAA\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "trElcFlcPMzg",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "\n",
+    "\n",
+    "---\n",
+    "1.2) Write a function to convert a DNA sequence into an RNA sequence!\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "ahMEGCWTPUAY",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "4f6e4e43-6b56-460b-fac9-4407444fd236",
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "source": [
+    "def convert_DNA_to_RNA(sequence):\n",
+    "  rna_sequence = []\n",
+    "  for character in sequence:\n",
+    "    if character == \"T\":\n",
+    "      rna_sequence += [\"U\"]\n",
+    "    else:\n",
+    "      rna_sequence += [character]\n",
+    "\n",
+    "  return ''.join(rna_sequence)\n",
+    "\n",
+    "print(sequence_1)\n",
+    "print(convert_DNA_to_RNA(sequence_1))\n",
+    "\n",
+    "print(sequence_2)\n",
+    "print(convert_DNA_to_RNA(sequence_2))"
+   ],
+   "execution_count": 2,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "AGACGTA\n",
+      "AGACGUA\n",
+      "TTTGACGTAT\n",
+      "UUUGACGUAU\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "EOMgpzlnPUJ6",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "\n",
+    "\n",
+    "---\n",
+    "1.3) Write a function to translate a DNA sequence into amino acids (first frame only)!\n",
+    "* Tip: [wiki - codon tables](https://en.wikipedia.org/wiki/DNA_and_RNA_codon_tables)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "aFVhE4KEPrM4",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "d64ea5ed-0a79-4d75-95ae-4841ce40a6a3",
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "source": [
+    "codons = {\n",
+    "  \"TTT\": \"F\", \"TTC\": \"F\",\n",
+    "  \"TTA\": \"L\", \"TTG\": \"L\", \"CTT\": \"L\", \"CTC\": \"L\", \"CTA\": \"L\", \"CTG\": \"L\",\n",
+    "  \"ATT\": \"I\", \"ATC\": \"I\",  \"ATA\": \"I\",\n",
+    "  \"ATG\": \"M\",\n",
+    "  \"GTT\": \"V\", \"GTC\": \"V\", \"GTA\": \"V\", \"GTG\": \"V\",\n",
+    "  \"TCT\": \"S\", \"TCC\": \"S\", \"TCA\": \"S\", \"TCG\": \"S\",\n",
+    "  \"CCT\": \"P\", \"CCC\": \"P\", \"CCA\": \"P\", \"CCG\": \"P\",\n",
+    "  \"ACT\": \"T\", \"ACC\": \"T\", \"ACA\": \"T\", \"ACG\": \"T\",\n",
+    "  \"GCT\": \"A\",\"GCC\": \"A\", \"GCA\": \"A\", \"GCG\": \"A\",\n",
+    "  \"TAT\": \"Y\", \"TAC\": \"Y\",\n",
+    "  \"CAT\": \"H\", \"CAC\": \"H\",\n",
+    "  \"CAA\": \"Q\", \"CAG\": \"Q\",\n",
+    "  \"AAT\": \"N\", \"AAC\": \"N\",\n",
+    "  \"AAA\": \"K\", \"AAG\": \"K\",\n",
+    "  \"GAT\": \"D\", \"GAC\": \"D\",\n",
+    "  \"GAA\": \"E\",\"GAG\": \"E\",\n",
+    "  \"TGT\": \"C\", \"TGC\": \"C\",\n",
+    "  \"TGG\": \"W\",\n",
+    "  \"CGT\": \"R\", \"CGC\": \"R\", \"CGA\": \"R\", \"CGG\": \"R\",\n",
+    "  \"AGT\": \"S\", \"AGC\": \"S\",\n",
+    "  \"AGA\": \"R\", \"AGG\": \"R\",\n",
+    "  \"GGT\": \"G\", \"GGC\": \"G\", \"GGA\": \"G\", \"GGG\": \"G\",\n",
+    "  \"TGA\": \"*\", \"TAA\": \"*\", \"TAG\": \"*\",\n",
+    "}\n",
+    "\n",
+    "def translate(sequence):\n",
+    "  peptide_sequence = []\n",
+    "  for i in range(0, len(sequence) - 2, 3):\n",
+    "    peptide_sequence += [codons[sequence[i: i+3]]]\n",
+    "  return ''.join(peptide_sequence)\n",
+    "\n",
+    "sequence = \"ATGCATGGTTGAGGCGGCATGCGTCGCGATTGG\"\n",
+    "print(translate(sequence))"
+   ],
+   "execution_count": 3,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "MHG*GGMRRDW\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "3-KHtLeePsQ4",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "\n",
+    "\n",
+    "---\n",
+    "1.4) Write a function to translate DNA sequences in all 6 frames into peptide sequences! The longest peptide sequence per DNA sequence should be returned!\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "s50AlgALP8I3",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "436613c5-0516-4f89-dc88-00427ecab66c",
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "source": [
+    "\"\"\"\n",
+    "Translates a DNA sequence in all 6 frames into peptide sequences\n",
+    "and yields the peptide sequences.\n",
+    "\"\"\"\n",
+    "def translate_all_frames(sequence):\n",
+    "  for i in range(3):\n",
+    "    yield translate(sequence[i:])\n",
+    "  \n",
+    "  rev_comp = get_reverse_complement(sequence)\n",
+    "  for i in range(3):\n",
+    "    yield translate(rev_comp[i:])\n",
+    "\n",
+    "\"\"\"\n",
+    "Find a longest valid peptide sequence, meaning one that starts with\n",
+    "M, in a peptide sequence.\n",
+    "\"\"\"\n",
+    "def get_longest_peptide_sequence(sequence):\n",
+    "  longest_length = 0\n",
+    "  longest_peptide_sequence = \"\"\n",
+    "  \n",
+    "  current_sequence = []\n",
+    "  currently_in_sequence = False\n",
+    "  for peptide in sequence:\n",
+    "    if currently_in_sequence:\n",
+    "      # encounter a stop codon\n",
+    "      if peptide == \"*\":\n",
+    "        if current_sequence:\n",
+    "          # update longest observed sequence and length if necessary\n",
+    "          if len(current_sequence) > longest_length:\n",
+    "            longest_length = len(current_sequence)\n",
+    "            longest_peptide_sequence = ''.join(current_sequence)\n",
+    "        # clear current sequence\n",
+    "        current_sequence = []\n",
+    "        currently_in_sequence = False\n",
+    "      else:\n",
+    "        # sequence extend continues\n",
+    "        current_sequence += [peptide]\n",
+    "    # currently not in a valid peptide sequence\n",
+    "    else:\n",
+    "      # encounter a start peptide\n",
+    "      if peptide == \"M\":\n",
+    "        current_sequence += [peptide]\n",
+    "        currently_in_sequence = True\n",
+    "      # if we are not in a valid peptide sequence and the current peptide is not\n",
+    "      # a start peptide, simply continue\n",
+    "\n",
+    "  if current_sequence:\n",
+    "    if len(current_sequence) > longest_length:\n",
+    "      longest_length = len(current_sequence)\n",
+    "      longest_peptide_sequence = ''.join(current_sequence)\n",
+    "\n",
+    "  return longest_peptide_sequence\n",
+    "\n",
+    "\"\"\"\n",
+    "Computes a longest valid peptide sequence for each of the 6 frames and yields it.\n",
+    "\"\"\"\n",
+    "def longest_peptide_sequence_per_frame(sequence):\n",
+    "  for frame in translate_all_frames(sequence):\n",
+    "    yield get_longest_peptide_sequence(frame)\n",
+    "\n",
+    "print(\"Reading frames:\")\n",
+    "for frame in translate_all_frames(sequence):\n",
+    "  print(frame)\n",
+    "print(\"\")\n",
+    "\n",
+    "print(\"Longest valid peptide sequence:\")\n",
+    "print(get_longest_peptide_sequence(translate(sequence)))\n",
+    "print(\"\")\n",
+    "\n",
+    "print(\"Longest valid peptide sequence per frame:\")\n",
+    "for longest_peptide_sequence in longest_peptide_sequence_per_frame(sequence):\n",
+    "  print(longest_peptide_sequence)"
+   ],
+   "execution_count": 4,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Reading frames:\n",
+      "MHG*GGMRRDW\n",
+      "CMVEAACVAI\n",
+      "AWLRRHASRL\n",
+      "PIATHAASTMH\n",
+      "QSRRMPPQPC\n",
+      "NRDACRLNHA\n",
+      "\n",
+      "Longest valid peptide sequence:\n",
+      "MRRDW\n",
+      "\n",
+      "Longest valid peptide sequence per frame:\n",
+      "MRRDW\n",
+      "MVEAACVAI\n",
+      "\n",
+      "MH\n",
+      "MPPQPC\n",
+      "\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "---\n",
+    "Human Dataset\n",
+    "\n",
+    "1.1) Count the number of unique genes in the file \"genes_human_genenames_duplicates.txt\" and write the unique gene names to a new file."
+   ],
+   "metadata": {
+    "id": "ey-lNS6d9XIQ",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "from google.colab import drive\n",
+    "drive.mount('/content/drive')\n",
+    "\n",
+    "with open(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/genes_human_genenames_duplicates.txt\", 'r') as genes:\n",
+    "   with open(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/gene_names_unique_human.txt\", 'w') as new_file:\n",
+    "      unique_genes = []\n",
+    "      line = genes.readline()\n",
+    "      while line:\n",
+    "        if '.' in line:\n",
+    "          line = line[:line.find('.')]    # dismiss the transcript number\n",
+    "        if not line in unique_genes:    # Check if the gene was already encountered and write it in the new file if not\n",
+    "          unique_genes.append(line)\n",
+    "          new_file.write(line)\n",
+    "        line = genes.readline()\n",
+    "    "
+   ],
+   "metadata": {
+    "id": "K_sqSkLx90lo",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "49045a1e-a48e-494e-ba71-82de736c9e84",
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "execution_count": 6,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Mounted at /content/drive\n"
+     ]
+    }
+   ]
+  }
+ ]
+}
\ No newline at end of file
+%% Cell type:markdown id: tags:
+
+# Python course 2021 - Exercises F
+
+%% Cell type:markdown id: tags:
+
+## Part 1 - DNA, RNA and peptide sequences
+
+%% Cell type:markdown id: tags:
+
+
+
+---
+1.1) Write a function to get the reverse complement (upper case letters) of a DNA sequence given in upper case letters!
+
+%% Cell type:code id: tags:
+
+``` 
+def get_reverse_complement(sequence):
+  bases = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
+
+  rev_comp = []
+  for i in range(len(sequence)-1, -1, -1):
+    rev_comp += [bases[sequence[i]]]
+
+  return ''.join(rev_comp)
+
+sequence_1 = "AGACGTA"
+print(sequence_1)
+print(get_reverse_complement(sequence_1))
+
+sequence_2 = "TTTGACGTAT"
+print(sequence_2)
+print(get_reverse_complement(sequence_2))
+```
+
+%% Output
+
+    AGACGTA
+    TACGTCT
+    TTTGACGTAT
+    ATACGTCAAA
+
+%% Cell type:markdown id: tags:
+
+
+
+---
+1.2) Write a function to convert a DNA sequence into an RNA sequence!
+
+%% Cell type:code id: tags:
+
+``` 
+def convert_DNA_to_RNA(sequence):
+  rna_sequence = []
+  for character in sequence:
+    if character == "T":
+      rna_sequence += ["U"]
+    else:
+      rna_sequence += [character]
+
+  return ''.join(rna_sequence)
+
+print(sequence_1)
+print(convert_DNA_to_RNA(sequence_1))
+
+print(sequence_2)
+print(convert_DNA_to_RNA(sequence_2))
+```
+
+%% Output
+
+    AGACGTA
+    AGACGUA
+    TTTGACGTAT
+    UUUGACGUAU
+
+%% Cell type:markdown id: tags:
+
+
+
+---
+1.3) Write a function to translate a DNA sequence into amino acids (first frame only)!
+* Tip: [wiki - codon tables](https://en.wikipedia.org/wiki/DNA_and_RNA_codon_tables)
+
+%% Cell type:code id: tags:
+
+``` 
+# Lookup table for translation: DNA codon (three upper-case bases) ->
+# one-letter amino acid code. All 64 codons are listed, grouped by amino
+# acid; the three stop codons (TGA, TAA, TAG) map to "*".
+codons = {
+  "TTT": "F", "TTC": "F",
+  "TTA": "L", "TTG": "L", "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
+  "ATT": "I", "ATC": "I",  "ATA": "I",
+  "ATG": "M",
+  "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
+  "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
+  "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
+  "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
+  "GCT": "A","GCC": "A", "GCA": "A", "GCG": "A",
+  "TAT": "Y", "TAC": "Y",
+  "CAT": "H", "CAC": "H",
+  "CAA": "Q", "CAG": "Q",
+  "AAT": "N", "AAC": "N",
+  "AAA": "K", "AAG": "K",
+  "GAT": "D", "GAC": "D",
+  "GAA": "E","GAG": "E",
+  "TGT": "C", "TGC": "C",
+  "TGG": "W",
+  "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
+  "AGT": "S", "AGC": "S",
+  "AGA": "R", "AGG": "R",
+  "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
+  "TGA": "*", "TAA": "*", "TAG": "*",
+}
+
+def translate(sequence):
+  peptide_sequence = []
+  for i in range(0, len(sequence) - 2, 3):
+    peptide_sequence += [codons[sequence[i: i+3]]]
+  return ''.join(peptide_sequence)
+
+# Example DNA sequence; the 6-frame cell further down reuses `sequence`.
+sequence = "ATGCATGGTTGAGGCGGCATGCGTCGCGATTGG"
+print(translate(sequence))
+```
+
+%% Output
+
+    MHG*GGMRRDW
+
+%% Cell type:markdown id: tags:
+
+
+
+---
+1.4) Write a function to translate DNA sequences in all 6 frames into peptide sequences! The longest peptide sequence per DNA sequence should be returned!
+
+%% Cell type:code id: tags:
+
+``` 
+"""
+Translates a DNA sequence in all 6 frames into peptide sequences
+and yields the peptide sequences.
+"""
+def translate_all_frames(sequence):
+  for i in range(3):
+    yield translate(sequence[i:])
+
+  rev_comp = get_reverse_complement(sequence)
+  for i in range(3):
+    yield translate(rev_comp[i:])
+
+"""
+Find a longest valid peptide sequence, meaning one that starts with
+M, in a peptide sequence.
+"""
+def get_longest_peptide_sequence(sequence):
+  longest_length = 0
+  longest_peptide_sequence = ""
+
+  current_sequence = []
+  currently_in_sequence = False
+  for peptide in sequence:
+    if currently_in_sequence:
+      # encounter a stop codon
+      if peptide == "*":
+        if current_sequence:
+          # update longest observed sequence and length if necessary
+          if len(current_sequence) > longest_length:
+            longest_length = len(current_sequence)
+            longest_peptide_sequence = ''.join(current_sequence)
+        # clear current sequence
+        current_sequence = []
+        currently_in_sequence = False
+      else:
+        # sequence extend continues
+        current_sequence += [peptide]
+    # currently not in a valid peptide sequence
+    else:
+      # encounter a start peptide
+      if peptide == "M":
+        current_sequence += [peptide]
+        currently_in_sequence = True
+      # if we are not in a valid peptide sequence and the current peptide is not
+      # a start peptide, simply continue
+
+  if current_sequence:
+    if len(current_sequence) > longest_length:
+      longest_length = len(current_sequence)
+      longest_peptide_sequence = ''.join(current_sequence)
+
+  return longest_peptide_sequence
+
+"""
+Computes a longest valid peptide sequence for each of the 6 frames and yields it.
+"""
+def longest_peptide_sequence_per_frame(sequence):
+  for frame in translate_all_frames(sequence):
+    yield get_longest_peptide_sequence(frame)
+
+print("Reading frames:")
+for frame in translate_all_frames(sequence):
+  print(frame)
+print("")
+
+print("Longest valid peptide sequence:")
+print(get_longest_peptide_sequence(translate(sequence)))
+print("")
+
+print("Longest valid peptide sequence per frame:")
+for longest_peptide_sequence in longest_peptide_sequence_per_frame(sequence):
+  print(longest_peptide_sequence)
+```
+
+%% Output
+
+    Reading frames:
+    MHG*GGMRRDW
+    CMVEAACVAI
+    AWLRRHASRL
+    PIATHAASTMH
+    QSRRMPPQPC
+    NRDACRLNHA
+    
+    Longest valid peptide sequence:
+    MRRDW
+    
+    Longest valid peptide sequence per frame:
+    MRRDW
+    MVEAACVAI
+    
+    MH
+    MPPQPC
+    
+
+%% Cell type:markdown id: tags:
+
+---
+Human Dataset
+
+1.1) Count the number of unique genes in the file "genes_human_genenames_duplicates.txt" and write the unique gene names to a new file.
+
+%% Cell type:code id: tags:
+
+``` 
+from google.colab import drive
+drive.mount('/content/drive')
+
+with open("/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/genes_human_genenames_duplicates.txt", 'r') as genes:
+   with open("/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/gene_names_unique_human.txt", 'w') as new_file:
+      unique_genes = []
+      line = genes.readline()
+      while line:
+        if '.' in line:
+          line = line[:line.find('.')]    # dismiss the transcript number
+        if not line in unique_genes:    # Check if the gene was already encountered and write it in the new file if not
+          unique_genes.append(line)
+          new_file.write(line)
+        line = genes.readline()
+
+```
+
+%% Output
+
+    Mounted at /content/drive