From 92ea647621113207e225d92eab0c231b86b6443f Mon Sep 17 00:00:00 2001
From: Franziska Niemeyer <franziska.niemeyer@uni-bielefeld.de>
Date: Mon, 8 Aug 2022 09:28:14 +0200
Subject: [PATCH] Upload solutions for exercises_F

---
 .../Python_course_2021_exercises_F.ipynb      | 417 ++++++++++++++++++
 1 file changed, 417 insertions(+)
 create mode 100644 Exercises/solutions/Python_course_2021_exercises_F.ipynb

diff --git a/Exercises/solutions/Python_course_2021_exercises_F.ipynb b/Exercises/solutions/Python_course_2021_exercises_F.ipynb
new file mode 100644
index 0000000..c58df78
--- /dev/null
+++ b/Exercises/solutions/Python_course_2021_exercises_F.ipynb
@@ -0,0 +1,417 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+  "colab": {
+   "name": "Python_course_2021_exercises_F.ipynb",
+   "provenance": [],
+   "collapsed_sections": []
+  },
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3"
+  }
+ },
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "C6hjQZrrO1lx",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "# Python course 2021 - Exercises F"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RoMsf4tCO3CA",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "## Part1 - DNA, RNA and peptide sequences"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "3_wMYLkSPBrx",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "\n",
+    "\n",
+    "---\n",
+    "1.1) Write a function to get the reverse complement (upper case letters) of a DNA sequence given in upper case letters!\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "Uq70MpOBPMpe",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "012a7aed-2e4e-4ac4-d32c-2b1b4e8c3784",
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "source": [
+    "def get_reverse_complement(sequence):\n",
+    "  bases = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}\n",
+    "\n",
+    "  rev_comp = []\n",
+    "  for i in range(len(sequence)-1, -1, -1):\n",
+    "    rev_comp += [bases[sequence[i]]]\n",
+    "\n",
+    "  return ''.join(rev_comp)\n",
+    "\n",
+    "sequence_1 = \"AGACGTA\"\n",
+    "print(sequence_1)\n",
+    "print(get_reverse_complement(sequence_1))\n",
+    "\n",
+    "sequence_2 = \"TTTGACGTAT\"\n",
+    "print(sequence_2)\n",
+    "print(get_reverse_complement(sequence_2))"
+   ],
+   "execution_count": 1,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "AGACGTA\n",
+      "TACGTCT\n",
+      "TTTGACGTAT\n",
+      "ATACGTCAAA\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "trElcFlcPMzg",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "\n",
+    "\n",
+    "---\n",
+    "1.2) Write a function to convert a DNA sequence into a RNA sequence!\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "ahMEGCWTPUAY",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "4f6e4e43-6b56-460b-fac9-4407444fd236",
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "source": [
+    "def convert_DNA_to_RNA(sequence):\n",
+    "  rna_sequence = []\n",
+    "  for character in sequence:\n",
+    "    if character == \"T\":\n",
+    "      rna_sequence += [\"U\"]\n",
+    "    else:\n",
+    "      rna_sequence += [character]\n",
+    "\n",
+    "  return ''.join(rna_sequence)\n",
+    "\n",
+    "print(sequence_1)\n",
+    "print(convert_DNA_to_RNA(sequence_1))\n",
+    "\n",
+    "print(sequence_2)\n",
+    "print(convert_DNA_to_RNA(sequence_2))"
+   ],
+   "execution_count": 2,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "AGACGTA\n",
+      "AGACGUA\n",
+      "TTTGACGTAT\n",
+      "UUUGACGUAU\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "EOMgpzlnPUJ6",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "\n",
+    "\n",
+    "---\n",
+    "1.3) Write a function to translate a DNA sequence into amino acids (first frame only)!\n",
+    "* Tip: [wiki - codon tables](https://en.wikipedia.org/wiki/DNA_and_RNA_codon_tables)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "aFVhE4KEPrM4",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "d64ea5ed-0a79-4d75-95ae-4841ce40a6a3",
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "source": [
+    "codons = {\n",
+    "  \"TTT\": \"F\", \"TTC\": \"F\",\n",
+    "  \"TTA\": \"L\", \"TTG\": \"L\", \"CTT\": \"L\", \"CTC\": \"L\", \"CTA\": \"L\", \"CTG\": \"L\",\n",
+    "  \"ATT\": \"I\", \"ATC\": \"I\",  \"ATA\": \"I\",\n",
+    "  \"ATG\": \"M\",\n",
+    "  \"GTT\": \"V\", \"GTC\": \"V\", \"GTA\": \"V\", \"GTG\": \"V\",\n",
+    "  \"TCT\": \"S\", \"TCC\": \"S\", \"TCA\": \"S\", \"TCG\": \"S\",\n",
+    "  \"CCT\": \"P\", \"CCC\": \"P\", \"CCA\": \"P\", \"CCG\": \"P\",\n",
+    "  \"ACT\": \"T\", \"ACC\": \"T\", \"ACA\": \"T\", \"ACG\": \"T\",\n",
+    "  \"GCT\": \"A\",\"GCC\": \"A\", \"GCA\": \"A\", \"GCG\": \"A\",\n",
+    "  \"TAT\": \"Y\", \"TAC\": \"Y\",\n",
+    "  \"CAT\": \"H\", \"CAC\": \"H\",\n",
+    "  \"CAA\": \"Q\", \"CAG\": \"Q\",\n",
+    "  \"AAT\": \"N\", \"AAC\": \"N\",\n",
+    "  \"AAA\": \"K\", \"AAG\": \"K\",\n",
+    "  \"GAT\": \"D\", \"GAC\": \"D\",\n",
+    "  \"GAA\": \"E\",\"GAG\": \"E\",\n",
+    "  \"TGT\": \"C\", \"TGC\": \"C\",\n",
+    "  \"TGG\": \"W\",\n",
+    "  \"CGT\": \"R\", \"CGC\": \"R\", \"CGA\": \"R\", \"CGG\": \"R\",\n",
+    "  \"AGT\": \"S\", \"AGC\": \"S\",\n",
+    "  \"AGA\": \"R\", \"AGG\": \"R\",\n",
+    "  \"GGT\": \"G\", \"GGC\": \"G\", \"GGA\": \"G\", \"GGG\": \"G\",\n",
+    "  \"TGA\": \"*\", \"TAA\": \"*\", \"TAG\": \"*\",\n",
+    "}\n",
+    "\n",
+    "def translate(sequence):\n",
+    "  peptide_sequence = []\n",
+    "  for i in range(0, len(sequence) - 2, 3):\n",
+    "    peptide_sequence += [codons[sequence[i: i+3]]]\n",
+    "  return ''.join(peptide_sequence)\n",
+    "\n",
+    "sequence = \"ATGCATGGTTGAGGCGGCATGCGTCGCGATTGG\"\n",
+    "print(translate(sequence))"
+   ],
+   "execution_count": 3,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "MHG*GGMRRDW\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "3-KHtLeePsQ4",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "\n",
+    "\n",
+    "---\n",
+    "1.4) Write a function to translate DNA sequences in all 6 frames into peptide sequences! The longest peptide sequence per DNA sequence should be returned!\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "s50AlgALP8I3",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "436613c5-0516-4f89-dc88-00427ecab66c",
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "source": [
+    "\"\"\"\n",
+    "Translates a DNA sequence in all 6 frames into peptide sequences\n",
+    "and yields the peptide sequences.\n",
+    "\"\"\"\n",
+    "def translate_all_frames(sequence):\n",
+    "  for i in range(3):\n",
+    "    yield translate(sequence[i:])\n",
+    "  \n",
+    "  rev_comp = get_reverse_complement(sequence)\n",
+    "  for i in range(3):\n",
+    "    yield translate(rev_comp[i:])\n",
+    "\n",
+    "\"\"\"\n",
+    "Find a longest valid peptide sequence, meaning one that starts with\n",
+    "M, in a peptide sequence.\n",
+    "\"\"\"\n",
+    "def get_longest_peptide_sequence(sequence):\n",
+    "  longest_length = 0\n",
+    "  longest_peptide_sequence = \"\"\n",
+    "  \n",
+    "  current_sequence = []\n",
+    "  currently_in_sequence = False\n",
+    "  for peptide in sequence:\n",
+    "    if currently_in_sequence:\n",
+    "      # encounter a stop codon\n",
+    "      if peptide == \"*\":\n",
+    "        if current_sequence:\n",
+    "          # update longest observed sequence and length if necessary\n",
+    "          if len(current_sequence) > longest_length:\n",
+    "            longest_length = len(current_sequence)\n",
+    "            longest_peptide_sequence = ''.join(current_sequence)\n",
+    "        # clear current sequence\n",
+    "        current_sequence = []\n",
+    "        currently_in_sequence = False\n",
+    "      else:\n",
+    "        # sequence extend continues\n",
+    "        current_sequence += [peptide]\n",
+    "    # currently not in a valid peptide sequence\n",
+    "    else:\n",
+    "      # encounter a start peptide\n",
+    "      if peptide == \"M\":\n",
+    "        current_sequence += [peptide]\n",
+    "        currently_in_sequence = True\n",
+    "      # if we are not in a valid peptide sequence and the current peptide is not\n",
+    "      # a start peptide, simply continue\n",
+    "\n",
+    "  if current_sequence:\n",
+    "    if len(current_sequence) > longest_length:\n",
+    "      longest_length = len(current_sequence)\n",
+    "      longest_peptide_sequence = ''.join(current_sequence)\n",
+    "\n",
+    "  return longest_peptide_sequence\n",
+    "\n",
+    "\"\"\"\n",
+    "Computes a longest valid peptide sequence for each of the 6 frames and yields it.\n",
+    "\"\"\"\n",
+    "def longest_peptide_sequence_per_frame(sequence):\n",
+    "  for frame in translate_all_frames(sequence):\n",
+    "    yield get_longest_peptide_sequence(frame)\n",
+    "\n",
+    "print(\"Reading frames:\")\n",
+    "for frame in translate_all_frames(sequence):\n",
+    "  print(frame)\n",
+    "print(\"\")\n",
+    "\n",
+    "print(\"Longest valid peptide sequence:\")\n",
+    "print(get_longest_peptide_sequence(translate(sequence)))\n",
+    "print(\"\")\n",
+    "\n",
+    "print(\"Longest valid peptide sequence per frame:\")\n",
+    "for longest_peptide_sequence in longest_peptide_sequence_per_frame(sequence):\n",
+    "  print(longest_peptide_sequence)"
+   ],
+   "execution_count": 4,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Reading frames:\n",
+      "MHG*GGMRRDW\n",
+      "CMVEAACVAI\n",
+      "AWLRRHASRL\n",
+      "PIATHAASTMH\n",
+      "QSRRMPPQPC\n",
+      "NRDACRLNHA\n",
+      "\n",
+      "Longest valid peptide sequence:\n",
+      "MRRDW\n",
+      "\n",
+      "Longest valid peptide sequence per frame:\n",
+      "MRRDW\n",
+      "MVEAACVAI\n",
+      "\n",
+      "MH\n",
+      "MPPQPC\n",
+      "\n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "---\n",
+    "Human Dataset\n",
+    "\n",
+    "1.1) Count number of unique genes from the file \"genes_human_genenames_duplicates.txt\" and write the gene names in a new file."
+   ],
+   "metadata": {
+    "id": "ey-lNS6d9XIQ",
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "from google.colab import drive\n",
+    "drive.mount('/content/drive')\n",
+    "\n",
+    "with open(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/genes_human_genenames_duplicates.txt\", 'r') as genes:\n",
+    "   with open(\"/content/drive/MyDrive/ColabNotebooks/UniPythonCourse/Exercises/data/gene_names_unique_human.txt\", 'w') as new_file:\n",
+    "      unique_genes = []\n",
+    "      line = genes.readline()\n",
+    "      while line:\n",
+    "        if '.' in line:\n",
+    "          line = line[:line.find('.')]    # dismiss the transcript number\n",
+    "        if not line in unique_genes:    # Check if the gene was already encountered and write it in the new file if not\n",
+    "          unique_genes.append(line)\n",
+    "          new_file.write(line)\n",
+    "        line = genes.readline()\n",
+    "    "
+   ],
+   "metadata": {
+    "id": "K_sqSkLx90lo",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "49045a1e-a48e-494e-ba71-82de736c9e84",
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "execution_count": 6,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Mounted at /content/drive\n"
+     ]
+    }
+   ]
+  }
+ ]
+}
\ No newline at end of file
-- 
GitLab