{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Python_course_2021_exercises_C.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "FyvebZ68I8BJ"
      },
      "source": [
        "# Python course 2021 - Exercises C"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8PgJ1ymVJCIO"
      },
      "source": [
        "## Part1 - file handling"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Ws7tJiEXJG8f"
      },
      "source": [
        "\n",
        "\n",
        "---\n",
        "1.1) Count number of sequences (number of headers) in AtCol0_Exons.fasta!\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JzgmMxR0JVxL",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "52d24ee3-445c-4b78-9b91-52de52a9791d"
      },
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oEJidIAEy8S5"
      },
      "source": [
        "datei = open(\"/content/drive/MyDrive/PythonProgramming/AtCol0_Exons.fasta\", \"r\")\n",
        "lines = datei.readlines()\n",
        "datei.close()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def get_num_headers(lines):\n",
        "  num_headers = 0\n",
        "  for line in lines:\n",
        "    if line:\n",
        "      if line[0] == \">\":\n",
        "        num_headers += 1\n",
        "  return num_headers\n",
        "\n",
        "print(get_num_headers(lines))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "q8s7_9qRxa_b",
        "outputId": "ad4a42f6-4f24-42bd-8508-0d5e92d59347"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "217183\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "rFtHoz5UKujx"
      },
      "source": [
        "\n",
        "\n",
        "---\n",
        "1.2) Count number of sequence lines!\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "AgMttuZlKyBg",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "88d9aa00-aa17-4778-e291-6d70cebfa67b"
      },
      "source": [
        "def get_num_sequence_lines(lines):\n",
        "  num_sequence_lines = 0\n",
        "  for line in lines:\n",
        "    if line:\n",
        "      if line[0] != \">\":\n",
        "        num_sequence_lines += 1\n",
        "  return num_sequence_lines\n",
        "\n",
        "print(get_num_sequence_lines(lines))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "916024\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "YTH3rkjJKyNm"
      },
      "source": [
        "\n",
        "\n",
        "---\n",
        "1.3) Count number of characters in document! (How many per line?)\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6ECkHsa9K3-X",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "fe3ccf0a-8afa-4b7f-ba19-e3870430989f"
      },
      "source": [
        "def get_num_characters(lines):\n",
        "  num_characters = 0\n",
        "  num_lines = 0\n",
        "  for line in lines:\n",
        "    num_characters += len(line)\n",
        "    num_lines += 1\n",
        "  return (num_characters, num_characters / num_lines)\n",
        "\n",
        "print(get_num_characters(lines))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "(81803755, 72.18783064347467)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "I9bkusUsK4HV"
      },
      "source": [
        "\n",
        "\n",
        "---\n",
        "1.4) How long are all contained sequences combined?\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XC4que0hK81W",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "e77692ed-7a33-47ef-d8ee-5ac9550535fe"
      },
      "source": [
        "def get_sequence_length(lines):\n",
        "  total_sequence_length = 0\n",
        "  for line in lines:\n",
        "    if line:\n",
        "      if line[0] != \">\":\n",
        "        line = line.strip()\n",
        "        total_sequence_length += len(line)\n",
        "  return total_sequence_length\n",
        "\n",
        "print(get_sequence_length(lines))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "64867051\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5NnxagAWK9AP"
      },
      "source": [
        "\n",
        "\n",
        "---\n",
        "1.5) Calculate the average sequence length in this file!\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MZNV3sNqLB62",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "01b885d6-dc65-4b5d-f893-288015678122"
      },
      "source": [
        "def get_average_sequence_length(lines):\n",
        "  return get_sequence_length(lines) / get_num_headers(lines)\n",
        "\n",
        "print(get_average_sequence_length(lines))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "298.67462462531597\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "**Additional exercises**"
      ],
      "metadata": {
        "id": "n9rZsJ5_4hTJ"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "1.6) Parse the fasta file entry-wise. An entry consists of a header and the corresponding sequence (which may comprise multiple lines). The result should be a list of tuples of the form (header, sequence)."
      ],
      "metadata": {
        "id": "ItrnPkVE5fsv"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "\"\"\"\n",
        "Parse a fasta file entry-wise as a list of tuples of the form (header, sequence).\n",
        "\"\"\"\n",
        "def read_fasta(file):\n",
        "  result = []\n",
        "\n",
        "  header = None\n",
        "  sequence = []\n",
        "  for line in file:\n",
        "    # remove all whitespace from the ends\n",
        "    line = line.strip()\n",
        "    if line.startswith('>'):\n",
        "      # if you find a header return the previous FASTA block in tuple form after\n",
        "      # concatenating the sequence lines(if there is a previous block)\n",
        "      if header:\n",
        "        result += [(header, ''.join(sequence))]\n",
        "\n",
        "      header = line\n",
        "      sequence = []\n",
        "    else:\n",
        "      # current line is not a header\n",
        "      # add line to the list of sequence lines of the current FASTA block after removing all whitespace from it\n",
        "      sequence.append(line.translate(str.maketrans('', '', whitespace)))\n",
        "  \n",
        "  if header:\n",
        "    result += [(header, ''.join(sequence))]\n",
        "  return result"
      ],
      "metadata": {
        "id": "RvF09FlO6YeT"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}