import random


# Exercises 1.1 / 1.2 - number guessing game with smaller/larger hints.
def _read_guess(prompt="Enter a number: "):
    """Prompt until the player enters something that parses as an integer."""
    while True:
        try:
            return int(input(prompt))
        except ValueError:
            # A typo must not crash the game or cost the player an attempt.
            print("That is not a whole number, please try again")


def guessing_game(num_tries, upper_limit):
    """Play an interactive number guessing game on stdin/stdout.

    A secret integer is drawn with ``random.randrange(upper_limit)``, i.e.
    ``0 <= secret < upper_limit``. After every wrong guess the player is told
    whether the guess was too low or too high.

    Args:
        num_tries: Number of valid (integer) guesses the player may make.
            Input that is not an integer is rejected and asked for again
            without using up a guess.
        upper_limit: Exclusive upper bound of the secret number. Must be
            positive, otherwise ``random.randrange`` raises ``ValueError``.

    Returns:
        None. All feedback is printed.
    """
    true_number = random.randrange(upper_limit)

    for _ in range(num_tries):
        user_input = _read_guess()

        if user_input == true_number:
            print("Correct! You win the game")
            return
        if user_input < true_number:
            print("Too low! Guess a higher number")
        else:
            print("Too high! Guess a lower number")

    print("You are out of attempts. Better luck next time")
    print(f"The correct number was {true_number}")


# Exercise 2.1 - count to a limit and print everything divisible by 4.
def get_multiples_of_four(limit):
    """Print the multiples of four from 0 up to and including ``limit``.

    The multiples are printed as a single list. ``limit`` itself is part of
    the count (so 100 is printed for ``limit=100``), matching the inclusive
    behaviour of ``countdown``. A negative ``limit`` prints an empty list.

    Args:
        limit: Largest number to consider (inclusive).

    Returns:
        None. The list is printed.
    """
    # Stepping by 4 from 0 visits exactly the numbers with ``n % 4 == 0``;
    # ``limit + 1`` makes the upper bound inclusive.
    print(list(range(0, limit + 1, 4)))


# Exercise 2.2 - count down to zero and print every number.
def countdown(start):
    """Print every integer from ``start`` down to 0 (both inclusive), one per line.

    Args:
        start: First number to print; nothing is printed if it is negative.

    Returns:
        None.
    """
    for i in range(start, -1, -1):
        print(i)


# Demo calls. ``__name__`` is "__main__" inside a notebook kernel, so these
# still run there, but importing the code does not block on input().
if __name__ == "__main__":
    guessing_game(3, 10)
    get_multiples_of_four(100)
    countdown(10)
"stdout", + "text": [ + "10\n", + "9\n", + "8\n", + "7\n", + "6\n", + "5\n", + "4\n", + "3\n", + "2\n", + "1\n", + "0\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F_k9KW4gHgwg" + }, + "source": [ + "\n", + "\n", + "---\n", + "2.3) Generate a list of species names! Write a function printing all species names starting with \"E\"!\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "daVLjj-aHo_C", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2106843b-a1f7-4a5a-cbac-1aecc9c38e2c" + }, + "source": [ + "species = [\"D. melanogaster\", \"M. musculus\", \"E. coli\", \"C. elegans\", \"H. sapiens\", \"B. napus\", \"B. vulgaris\", \"E. multilocularis\", \"E. a\"]\n", + "\n", + "def filter_species_0(species):\n", + " filtered_species = [name for name in species if name[0] == \"E\"]\n", + " return filtered_species\n", + "\n", + "print(filter_species_0(species))" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['E. coli', 'E. multilocularis', 'E. a']\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vC_BZDXZHpJA" + }, + "source": [ + "\n", + "\n", + "---\n", + "2.4) Expand this function to limit the printing to species names which are additionally shorter than 10 characters!\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BS1PycUwHydi", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "ad0a4ddc-b0a3-4c69-aea9-e6e94902df90" + }, + "source": [ + "def filter_species_1(species):\n", + " filtered_species = filter_species_0(species)\n", + " filtered_species = [name for name in filtered_species if len(name) < 10]\n", + " return filtered_species\n", + "\n", + "print(filter_species_1(species))" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['E. coli', 'E. 
# Exercise 2.5 - additionally require the name to end with "a".
def filter_species_2(species):
    """Return the names accepted by ``filter_species_1`` that also end with "a".

    Args:
        species: Collection of species names, passed on unchanged to
            ``filter_species_1``.

    Returns:
        A new list, in input order, of the names whose last character is "a".
    """
    names_ending_in_a = []
    for candidate in filter_species_1(species):
        if candidate[-1] == "a":
            names_ending_in_a.append(candidate)
    return names_ending_in_a


# Demo call (runs in the notebook, skipped on import).
if __name__ == "__main__":
    print(filter_species_2(species))
# Exercise 2.6 input data: four protein sequences taken from UniProt, in
# one-letter amino acid code. Each sequence is written as several adjacent
# string chunks joined with "+" purely to keep the source lines short.
proteins = [
    # P01308 (insulin, H. sapiens)
    "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGA"
    + "GSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN",
    # P68871 (hemoglobin subunit beta, H. sapiens)
    "MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSD"
    + "GLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH",
    # O22264 (transcription factor MYB12, A. thaliana)
    "MGRAPCCEKVGIKRGRWTAEEDQILSNYIQSNGEGSWRSLPKNAGLKRCGKSCRLRWINYLRSDLKRGNITPEE"
    + "EELVVKLHSTLGNRWSLIAGHLPGRTDNEIKNYWNSHLSRKLHNFIRKPSISQDVSAVIMTNASSAPPPPQA"
    + "KRRLGRTSRSAMKPKIHRTKTRKTKKTSAPPEPNADVAGADKEALMVESSGAEAELGRPCDYYGDDCNKNLM"
    + "SINGDNGVLTFDDDIIDLLLDESDPGHLYTNTTCGGDGELHNIRDSEGARGFSDTWNQGNLDCLLQSCPSVE"
    + "SFLNYDHQVNDASTDEFIDWDCVWQEGSDNNLWHEKENPDSMVSWLLDGDDEATIGNSNCENFGEPLDHDDE"
    + "SALVAWLLS",
    # P19821 (DNA polymerase I, thermostable, Thermus aquaticus)
    "MRGMLPLFEPKGRVLLVDGHHLAYRTFHALKGLTTSRGEPVQAVYGFAKSLLKALKEDGDAVIVVFDAKAPSFR"
    + "HEAYGGYKAGRAPTPEDFPRQLALIKELVDLLGLARLEVPGYEADDVLASLAKKAEKEGYEVRILTADKDLY"
    + "QLLSDRIHVLHPEGYLITPAWLWEKYGLRPDQWADYRALTGDESDNLPGVKGIGEKTARKLLEEWGSLEALL"
    + "KNLDRLKPAIREKILAHMDDLKLSWDLAKVRTDLPLEVDFAKRREPDRERLRAFLERLEFGSLLHEFGLLES"
    + "PKALEEAPWPPPEGAFVGFVLSRKEPMWADLLALAAARGGRVHRAPEPYKALRDLKEARGLLAKDLSVLALR"
    + "EGLGLPPGDDPMLLAYLLDPSNTTPEGVARRYGGEWTEEAGERAALSERLFANLWGRLEGEERLLWLYREVE"
    + "RPLSAVLAHMEATGVRLDVAYLRALSLEVAEEIARLEAEVFRLAGHPFNLNSRDQLERVLFDELGLPAIGKT"
    + "EKTGKRSTSAAVLEALREAHPIVEKILQYRELTKLKSTYIDPLPDLIHPRTGRLHTRFNQTATATGRLSSSD"
    + "PNLQNIPVRTPLGQRIRRAFIAEEGWLLVALDYSQIELRVLAHLSGDENLIRVFQEGRDIHTETASWMFGVP"
    + "REAVDPLMRRAAKTINFGVLYGMSAHRLSQELAIPYEEAQAFIERYFQSFPKVRAWIEKTLEEGRRRGYVET"
    + "LFGRRRYVPDLEARVKSVREAAERMAFNMPVQGTAADLMKLAMVKLFPRLEEMGARMLLQVHDELVLEAPKE"
    + "RAEAVARLAKEVMEGVYPLAVPLEVEVGIGEDWLSAKE"
]
# Exercise 2.6 - search protein sequences for a motif and report where it lies.
def find_motive(proteins, motive):
    """Lazily find the first occurrence of a sequence motif in each protein.

    Implemented as a generator, so callers can stop early or stream the
    results; nothing is searched until the result is iterated.

    Args:
        proteins: Iterable of protein sequences (strings).
        motive: The motif (substring) to look for, e.g. "VAL".

    Yields:
        ``(position, protein)`` for every protein that contains the motif,
        where ``position`` is the 0-based index of its first occurrence.
        Proteins without the motif are skipped.
    """
    for protein in proteins:
        # str.find gives the index of the first match, or -1 if there is none.
        position = protein.find(motive)
        if position < 0:
            continue
        yield (position, protein)


# Demo (runs in the notebook, skipped on import because ``proteins`` is
# defined in another cell).
if __name__ == "__main__":
    motive = "MA"
    for entry in find_motive(proteins, motive):
        print(entry)
"cell_type": "markdown", + "source": [ + "2.7) What is the amino acid composition of the proteins? Which amino acid occurs most rarely?" + ], + "metadata": { + "id": "8CO8eqBsON-S" + } + }, + { + "cell_type": "code", + "source": [ + "def get_sequence_composition(sequence):\n", + " # dictionary saving the number of observed occurrences of each character\n", + " num_occurrences = {}\n", + " for character in sequence:\n", + " # setdefault returns the value of the key if the key is already in the dictionary\n", + " # otherwise it returns the default value (here 0) and adds the (key, default) pair\n", + " # to the dictionary\n", + " num_occurrences[character] = num_occurrences.setdefault(character, 0) + 1\n", + " return num_occurrences\n", + "\n", + "def get_rarest_symbol(sequence):\n", + " num_occurrences = get_sequence_composition(sequence)\n", + " min_occurrences = len(sequence) + 1\n", + " rarest_symbol = \"\"\n", + " for symbol, occurrences in num_occurrences.items():\n", + " if occurrences < min_occurrences:\n", + " min_occurrences = occurrences\n", + " rarest_symbol = symbol\n", + " return (rarest_symbol, min_occurrences)\n", + "\n", + "print(get_sequence_composition(proteins[0]))\n", + "print(get_rarest_symbol(proteins[0]))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KqiT9sYX3yfW", + "outputId": "619123e6-58aa-4039-9b7e-dbd10c9bdd43" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'M': 2, 'A': 10, 'L': 20, 'W': 2, 'R': 5, 'P': 6, 'G': 12, 'D': 2, 'F': 3, 'V': 6, 'N': 3, 'Q': 7, 'H': 2, 'C': 6, 'S': 5, 'E': 8, 'Y': 4, 'T': 3, 'K': 2, 'I': 2}\n", + "('M', 2)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SL5Jk3GHIMLx" + }, + "source": [ + "## Part3 - range & enumerate" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A65Z5AeQIQrQ" + }, + "source": [ + "\n", + "\n", + "---\n", + "3.1) Write a script to print 50x 
\"here\" and the current value of the control variable!\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "kwRdBZ5lIaWb", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "94195fa5-4111-4fa3-84bc-0f230a64ea0d" + }, + "source": [ + "def print_here(iterations):\n", + " for i in range(iterations):\n", + " print(i, \"here\")\n", + "\n", + "print_here(50)" + ], + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0 here\n", + "1 here\n", + "2 here\n", + "3 here\n", + "4 here\n", + "5 here\n", + "6 here\n", + "7 here\n", + "8 here\n", + "9 here\n", + "10 here\n", + "11 here\n", + "12 here\n", + "13 here\n", + "14 here\n", + "15 here\n", + "16 here\n", + "17 here\n", + "18 here\n", + "19 here\n", + "20 here\n", + "21 here\n", + "22 here\n", + "23 here\n", + "24 here\n", + "25 here\n", + "26 here\n", + "27 here\n", + "28 here\n", + "29 here\n", + "30 here\n", + "31 here\n", + "32 here\n", + "33 here\n", + "34 here\n", + "35 here\n", + "36 here\n", + "37 here\n", + "38 here\n", + "39 here\n", + "40 here\n", + "41 here\n", + "42 here\n", + "43 here\n", + "44 here\n", + "45 here\n", + "46 here\n", + "47 here\n", + "48 here\n", + "49 here\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EXYM3E55Iaf4" + }, + "source": [ + "\n", + "\n", + "---\n", + "3.2) Write a script to walk through the species list and to print the character from the species where the index corresponds to the current control variable value!\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xf1tPln-Im6w", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "7d7a8a6a-0b31-47c0-bbf0-0637be30d9f3" + }, + "source": [ + "species = [\"D. melanogaster\", \"M. musculus\", \"\", \"A\", None, \"E. coli\", \"C. elegans\", \"H. sapiens\", \"B. napus\", \"B. vulgaris\", \"E. multilocularis\", \"E. 
a\"]\n", + "\n", + "def print_index_char(species):\n", + " for index, name in enumerate(species):\n", + " # ignore empty names and None\n", + " if name:\n", + " # if the index is larger than the largest possible index for this name\n", + " # we need to correct it by setting it to the last valid index\n", + " corrected_index = min(index, len(name) - 1)\n", + " print(index, name[corrected_index])\n", + "\n", + "# note that indices 2 and 4 are ignored because for these species name there are no characters\n", + "print_index_char(species)" + ], + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0 D\n", + "1 .\n", + "3 A\n", + "5 l\n", + "6 g\n", + "7 e\n", + "8 s\n", + "9 i\n", + "10 c\n", + "11 a\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "**Additional exercises**" + ], + "metadata": { + "id": "gnw2K102CI_Y" + } + }, + { + "cell_type": "markdown", + "source": [ + "3.3) Given two arbitrary sequences *x* and *y*, find a longest common substring of *x* and *y*.\n", + "Example: *x* = ACGCTA, *y* = CGCGTA yields the result CGC." 
# Exercise 3.3 - longest common substring of two sequences x and y
# (|x| = n, |y| = m), brute-force variant.
def longest_common_substring_naive(x, y):
    """Find a longest common substring of ``x`` and ``y`` by brute force.

    Every pair of start positions (one in ``x``, one in ``y``) is tried and
    extended for as long as the symbols agree. If several common substrings
    share the maximal length, the one starting earliest in ``x`` is returned.

    Time complexity: O(n * m * min(n, m)), which is within O(n * m^2).
    Auxiliary space complexity: O(1).

    Args:
        x: First sequence (supports ``len``, indexing and slicing).
        y: Second sequence over the same alphabet.

    Returns:
        The slice of ``x`` that forms the longest common substring; empty if
        the sequences share no symbol.
    """
    size_x, size_y = len(x), len(y)
    best_start = 0
    best_length = 0

    for start_x in range(size_x):
        for start_y in range(size_y):
            # extend the match symbol by symbol until a mismatch or either end
            run = 0
            while (
                start_x + run < size_x
                and start_y + run < size_y
                and x[start_x + run] == y[start_y + run]
            ):
                run += 1
            # strict ">" keeps the first (leftmost in x) match on ties
            if run > best_length:
                best_length = run
                best_start = start_x

    return x[best_start:best_start + best_length]
# Exercise 3.3 - longest common substring of two sequences x and y
# (|x| = n, |y| = m), dynamic programming variant.
def longest_common_substring_dp(x, y):
    """Find a longest common substring of ``x`` and ``y`` with dynamic programming.

    No space optimizations are applied. ``table[i][j]`` holds the length of
    the longest common suffix of the prefixes ``x[0:i]`` and ``y[0:j]``; the
    largest entry of the table is the length of a longest common substring.
    With 1-based positions the recurrence is

        table[i][j] = table[i-1][j-1] + 1   if x[i] == y[j]
        table[i][j] = 0                     otherwise

    for 1 <= i <= n, 1 <= j <= m, anchored by ``table[i][0] = 0`` and
    ``table[0][j] = 0``. If several substrings share the maximal length, the
    one ending earliest in ``x`` is returned.

    Time complexity: O(n * m).
    Auxiliary space complexity: O(n * m).

    Args:
        x: First sequence (supports ``len`` and slicing).
        y: Second sequence over the same alphabet.

    Returns:
        The slice of ``x`` that forms the longest common substring; empty if
        the sequences share no symbol.
    """
    # (n + 1) x (m + 1) table; row 0 and column 0 are the zero anchors
    table = [[0] * (len(y) + 1) for _ in range(len(x) + 1)]

    best_length = 0
    best_end = 0

    # fill the table row by row
    for i, symbol_x in enumerate(x, start=1):
        for j, symbol_y in enumerate(y, start=1):
            if symbol_x != symbol_y:
                # mismatch: the cell keeps its initial value 0
                continue
            table[i][j] = table[i - 1][j - 1] + 1
            if table[i][j] > best_length:
                best_length = table[i][j]
                best_end = i

    return x[best_end - best_length:best_end]


x = "ACGCTA"
x_2 = "ACGCTAC"
y = "CGCGTA"
y_2 = "CGCGTAG"

# Demo (runs in the notebook, skipped on import because the naive variant is
# defined separately).
if __name__ == "__main__":
    print(longest_common_substring_naive(x, y))
    print(longest_common_substring_dp(x, y))
    print(longest_common_substring_dp(x_2, y))
    print(longest_common_substring_dp(x, y_2))
be optimized substantially.\n", + "\n", + "First, note that the computation in that solution only ever depends on the previously computed row. Therefore, it suffices to store only two rows at a time, which decreases the auxiliary space complexity to O(min(n,m)) when each row spans the shorter of the two sequences.\n", + "\n", + "There is still room for improvement. If we perform the computation diagonal-wise instead of row-wise, we only need to store the last already computed element of the current diagonal. This way, we can get away with O(1) auxiliary space usage.\n", + "\n", + "Another, totally different solution of the longest common substring problem revolves around a data structure named generalized suffix tree. With the help of this data structure it is possible to obtain a solution with O(n + m) time and auxiliary space complexity. However, that solution is far more difficult to implement and the relatively high constant factors in the space usage may make it prohibitive for large inputs.\n" + ], + "metadata": { + "id": "F9tF15SdxSws" + } + } + ] +} \ No newline at end of file