{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "3677c309-5bc4-4954-becd-4a5779e8091d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                Filename  Total paragraphs  Total words  Words per paragraph  \\\n",
      "0    Q-AS-O/Q-AS-O_1.txt                17         1479            87.000000   \n",
      "1    Q-AS-O/Q-AS-O_2.txt                12         1449           120.750000   \n",
      "2    Q-AS-O/Q-AS-O_3.txt                14         1469           104.928571   \n",
      "3    Q-AS-O/Q-AS-O_4.txt                18         1475            81.944444   \n",
      "4    Q-AS-O/Q-AS-O_5.txt                14         1439           102.785714   \n",
      "5    Q-AS-O/Q-AS-O_6.txt                14         1435           102.500000   \n",
      "6    Q-AS-O/Q-AS-O_7.txt                14         1514           108.142857   \n",
      "7    Q-AS-O/Q-AS-O_8.txt                13         1440           110.769231   \n",
      "8    Q-AS-O/Q-AS-O_9.txt                14         1415           101.071429   \n",
      "9   Q-AS-O/Q-AS-O_10.txt                15         1501           100.066667   \n",
      "10  Q-AS-O/Q-AS-O_11.txt                12         1328           110.666667   \n",
      "11  Q-AS-O/Q-AS-O_12.txt                10         1270           127.000000   \n",
      "12  Q-AS-O/Q-AS-O_13.txt                10         1378           137.800000   \n",
      "13  Q-AS-O/Q-AS-O_14.txt                14         1435           102.500000   \n",
      "14  Q-AS-O/Q-AS-O_15.txt                12         1381           115.083333   \n",
      "\n",
      "    Total sentences  Average Sentence Length  Longest Sentence  \\\n",
      "0                64                23.109375                47   \n",
      "1                59                24.559322                59   \n",
      "2                55                26.709091                69   \n",
      "3                64                23.046875                45   \n",
      "4                51                28.215686                63   \n",
      "5                58                24.741379                55   \n",
      "6                55                27.527273                48   \n",
      "7                53                27.169811                42   \n",
      "8                53                26.698113                71   \n",
      "9                59                25.440678                46   \n",
      "10               55                24.145455                39   \n",
      "11               48                26.458333                54   \n",
      "12               50                27.560000                74   \n",
      "13               58                24.741379                92   \n",
      "14               54                25.574074                63   \n",
      "\n",
      "    Shortest Sentence  \n",
      "0                   3  \n",
      "1                  10  \n",
      "2                  13  \n",
      "3                   9  \n",
      "4                  11  \n",
      "5                  10  \n",
      "6                  12  \n",
      "7                  13  \n",
      "8                  13  \n",
      "9                  12  \n",
      "10                 10  \n",
      "11                 14  \n",
      "12                 16  \n",
      "13                  8  \n",
      "14                 15  \n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import re\n",
    "import pandas as pd\n",
    "\n",
    "# Define the base filename and the number of files\n",
    "base_filename = \"Q-AS-O\"\n",
    "num_files = 15\n",
    "\n",
    "# Single source of truth for the column order of every output record\n",
    "columns = [\n",
    "    'Filename', 'Total paragraphs', 'Total words', 'Words per paragraph',\n",
    "    'Total sentences', 'Average Sentence Length', 'Longest Sentence',\n",
    "    'Shortest Sentence'\n",
    "]\n",
    "\n",
    "# Initialize list to collect results\n",
    "results = []\n",
    "\n",
    "# Loop through each file\n",
    "for i in range(1, num_files + 1):\n",
    "    # Construct the filename\n",
    "    filename = f\"{base_filename}/{base_filename}_{i}.txt\"\n",
    "\n",
    "    # Read the file if it exists; treat a missing file as empty content\n",
    "    if os.path.exists(filename):\n",
    "        with open(filename, 'r', encoding='utf-8') as file:\n",
    "            content = file.read()\n",
    "    else:\n",
    "        content = \"\"\n",
    "\n",
    "    # Missing or empty file: record placeholders (same column order as real\n",
    "    # rows) so the output always has num_files rows and no division by zero\n",
    "    if not content.strip():\n",
    "        record = dict.fromkeys(columns)\n",
    "        record['Filename'] = filename\n",
    "        results.append(record)\n",
    "        continue\n",
    "\n",
    "    # Protect common abbreviations whose trailing period does not end a sentence\n",
    "    abbreviations = [\n",
    "        \"Mr.\", \"Mrs.\", \"Ms.\", \"Dr.\", \"Prof.\", \"Sr.\", \"Jr.\", \"St.\", \"Mt.\",\n",
    "        \"J.\", \"J.J.\", \"Col.\", \"Gen.\", \"Rev.\", \"Lt.\", \"Hon.\"\n",
    "    ]\n",
    "    for abbr in abbreviations:\n",
    "        content = content.replace(abbr, abbr.replace(\".\", \"<DOT>\"))\n",
    "\n",
    "    # Split into sentences at ./!/? followed by whitespace and a capital letter\n",
    "    sentences = re.split(r'(?<=[.!?])\\s+(?=[A-Z])', content.strip())\n",
    "    # Restore the protected abbreviation periods\n",
    "    sentences = [s.replace(\"<DOT>\", \".\") for s in sentences]\n",
    "\n",
    "    # Paragraphs are blank-line-separated blocks; drop empty splits so\n",
    "    # leading/consecutive blank lines do not inflate the count\n",
    "    paragraphs = [p for p in re.split(r'^\\n', content, flags=re.MULTILINE | re.UNICODE) if p.strip()]\n",
    "    paragraphs_count = len(paragraphs)\n",
    "\n",
    "    # Whitespace-tokenize the whole text for the word count\n",
    "    words = content.split()\n",
    "    total_words = len(words)\n",
    "    total_sentences = len(sentences)\n",
    "\n",
    "    # split() with no argument collapses runs of spaces and embedded newlines,\n",
    "    # so sentence lengths are not distorted by empty tokens (split(\" \") was)\n",
    "    tokenized_sentences = [sentence.split() for sentence in sentences]\n",
    "    longest_sen_len = len(max(tokenized_sentences, key=len))\n",
    "    shortest_sen_len = len(min(tokenized_sentences, key=len))\n",
    "\n",
    "    # Averages are safe here: the empty-content guard above guarantees at\n",
    "    # least one sentence and one paragraph\n",
    "    average_sen_len = total_words / total_sentences\n",
    "    words_per_para = total_words / paragraphs_count\n",
    "\n",
    "    # Collect the results\n",
    "    results.append({\n",
    "        'Filename': filename,\n",
    "        'Total paragraphs': paragraphs_count,\n",
    "        'Total words': total_words,\n",
    "        'Words per paragraph': words_per_para,\n",
    "        'Total sentences': total_sentences,\n",
    "        'Average Sentence Length': average_sen_len,\n",
    "        'Longest Sentence': longest_sen_len,\n",
    "        'Shortest Sentence': shortest_sen_len\n",
    "    })\n",
    "\n",
    "# Convert and output to DataFrame and CSV (columns= pins a stable order)\n",
    "df = pd.DataFrame(results, columns=columns)\n",
    "print(df)\n",
    "df.to_csv(f'{base_filename}/{base_filename}_sentence and para_results.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
