{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Intro to Pandas, Reading Data, and Plotting\n", "\n", "The cell below loads a few libraries and does some initialization. In this notebook we'll do a few basic data manipulations, see the Pandas formatting for the first time, and make some simple graphs.\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "### Standard Magic and startup initializers.\n", "\n", "# Load Numpy\n", "import numpy as np\n", "# Load MatPlotLib\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "# Load Pandas\n", "import pandas as pd\n", "\n", "# This lets us show plots inline and also save PDF plots if we want them\n", "%matplotlib inline\n", "from matplotlib.backends.backend_pdf import PdfPages\n", "matplotlib.style.use('fivethirtyeight')\n", "# Seaborn is a plotting package for Pandas that we'll try out...\n", "import seaborn as sns\n", "\n", "# Make the fonts a little bigger..\n", "font = {'size' : 24}\n", "matplotlib.rc('font', **font)\n", "# Render math text with the Computer Modern font set\n", "matplotlib.rcParams['mathtext.fontset'] = 'cm'\n", "# Embed fonts in saved PDFs as TrueType (Type 42)\n", "matplotlib.rcParams['pdf.fonttype'] = 42\n", "\n", "# These two things are for Pandas: they widen the notebook and let us display data easily.\n", "# (Import them from the public IPython.display module; IPython.core.display is a deprecated import path.)\n", "from IPython.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Working with some real data and Pandas!\n", "\n", "Opening and reading CSV files is very easy with Pandas [Read CSV Documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Open the NBA Salaries file.\n", "\n", "df_nba = pd.read_csv(\"./nba_salaries.csv\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { 
"text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PLAYERPOSITIONTEAM'15-'16 SALARY
0Paul MillsapPFAtlanta Hawks18.671659
1Al HorfordCAtlanta Hawks12.000000
2Tiago SplitterCAtlanta Hawks9.756250
3Jeff TeaguePGAtlanta Hawks8.000000
4Kyle KorverSGAtlanta Hawks5.746479
\n", "
" ], "text/plain": [ " PLAYER POSITION TEAM '15-'16 SALARY\n", "0 Paul Millsap PF Atlanta Hawks 18.671659\n", "1 Al Horford C Atlanta Hawks 12.000000\n", "2 Tiago Splitter C Atlanta Hawks 9.756250\n", "3 Jeff Teague PG Atlanta Hawks 8.000000\n", "4 Kyle Korver SG Atlanta Hawks 5.746479" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Display gives us a basic table. Note that we can index and slice this in many different ways.\n", "display(df_nba[:5])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PLAYERPOSITIONTEAM'15-'16 SALARY
121Stephen CurryPGGolden State Warriors11.370786
\n", "
" ], "text/plain": [ " PLAYER POSITION TEAM '15-'16 SALARY\n", "121 Stephen Curry PG Golden State Warriors 11.370786" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at a specific person with a boolean mask inside .loc... a little clunky.\n", "df_nba.loc[df_nba['PLAYER'] == \"Stephen Curry\"]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PLAYERPOSITIONTEAM'15-'16 SALARY
3Jeff TeaguePGAtlanta Hawks8.000000
8Dennis SchroderPGAtlanta Hawks1.763400
14Avery BradleyPGBoston Celtics7.730337
15Isaiah ThomasPGBoston Celtics6.912869
18Marcus SmartPGBoston Celtics3.431040
\n", "
" ], "text/plain": [ " PLAYER POSITION TEAM '15-'16 SALARY\n", "3 Jeff Teague PG Atlanta Hawks 8.000000\n", "8 Dennis Schroder PG Atlanta Hawks 1.763400\n", "14 Avery Bradley PG Boston Celtics 7.730337\n", "15 Isaiah Thomas PG Boston Celtics 6.912869\n", "18 Marcus Smart PG Boston Celtics 3.431040" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Can filter for a whole set\n", "df_nba.loc[df_nba['POSITION'] == \"PG\"][:5]\n", "\n", "# Again note that we can slice this different ways.." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PLAYERPOSITIONTEAM'15-'16 SALARY
239Eric GordonSGNew Orleans Pelicans15.514031
240Tyreke EvansSGNew Orleans Pelicans10.734586
241Jrue HolidayPGNew Orleans Pelicans10.595507
242Omer AsikCNew Orleans Pelicans9.213483
243Ryan AndersonPFNew Orleans Pelicans8.500000
244Anthony DavisPFNew Orleans Pelicans7.070730
245Alexis AjincaCNew Orleans Pelicans4.389607
246Quincy PondexterSFNew Orleans Pelicans3.382023
247Norris ColePGNew Orleans Pelicans3.036927
248Dante CunninghamPFNew Orleans Pelicans2.850000
249Alonzo GeeSFNew Orleans Pelicans1.320000
250Toney DouglasPGNew Orleans Pelicans1.164858
251Luke BabbittSFNew Orleans Pelicans1.100602
252Kendrick PerkinsCNew Orleans Pelicans0.947276
253Bo McCalebbPGNew Orleans Pelicans0.525093
254Bryce Dejean-JonesSGNew Orleans Pelicans0.169883
\n", "
" ], "text/plain": [ " PLAYER POSITION TEAM '15-'16 SALARY\n", "239 Eric Gordon SG New Orleans Pelicans 15.514031\n", "240 Tyreke Evans SG New Orleans Pelicans 10.734586\n", "241 Jrue Holiday PG New Orleans Pelicans 10.595507\n", "242 Omer Asik C New Orleans Pelicans 9.213483\n", "243 Ryan Anderson PF New Orleans Pelicans 8.500000\n", "244 Anthony Davis PF New Orleans Pelicans 7.070730\n", "245 Alexis Ajinca C New Orleans Pelicans 4.389607\n", "246 Quincy Pondexter SF New Orleans Pelicans 3.382023\n", "247 Norris Cole PG New Orleans Pelicans 3.036927\n", "248 Dante Cunningham PF New Orleans Pelicans 2.850000\n", "249 Alonzo Gee SF New Orleans Pelicans 1.320000\n", "250 Toney Douglas PG New Orleans Pelicans 1.164858\n", "251 Luke Babbitt SF New Orleans Pelicans 1.100602\n", "252 Kendrick Perkins C New Orleans Pelicans 0.947276\n", "253 Bo McCalebb PG New Orleans Pelicans 0.525093\n", "254 Bryce Dejean-Jones SG New Orleans Pelicans 0.169883" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Can also see a team...\n", "df_nba.loc[df_nba['TEAM'] == \"New Orleans Pelicans\"]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PLAYERPOSITIONTEAM'15-'16 SALARY
156Chris PaulPGLos Angeles Clippers21.468695
60Derrick RosePGChicago Bulls20.093064
269Russell WestbrookPGOklahoma City Thunder16.744218
74Kyrie IrvingPGCleveland Cavaliers16.407501
400John WallPGWashington Wizards15.851950
\n", "
" ], "text/plain": [ " PLAYER POSITION TEAM '15-'16 SALARY\n", "156 Chris Paul PG Los Angeles Clippers 21.468695\n", "60 Derrick Rose PG Chicago Bulls 20.093064\n", "269 Russell Westbrook PG Oklahoma City Thunder 16.744218\n", "74 Kyrie Irving PG Cleveland Cavaliers 16.407501\n", "400 John Wall PG Washington Wizards 15.851950" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at all PGs sorted by salary...\n", "df_nba.loc[df_nba['POSITION'] == \"PG\"].sort_values(\"'15-'16 SALARY\", ascending=False)[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that the above sorting does not happen *in place* unless we explicitly tell Pandas to do so -- [Documentation for sort_values](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PLAYERPOSITIONTEAM'15-'16 SALARY
3Jeff TeaguePGAtlanta Hawks8.000000
8Dennis SchroderPGAtlanta Hawks1.763400
14Avery BradleyPGBoston Celtics7.730337
15Isaiah ThomasPGBoston Celtics6.912869
18Marcus SmartPGBoston Celtics3.431040
\n", "
" ], "text/plain": [ " PLAYER POSITION TEAM '15-'16 SALARY\n", "3 Jeff Teague PG Atlanta Hawks 8.000000\n", "8 Dennis Schroder PG Atlanta Hawks 1.763400\n", "14 Avery Bradley PG Boston Celtics 7.730337\n", "15 Isaiah Thomas PG Boston Celtics 6.912869\n", "18 Marcus Smart PG Boston Celtics 3.431040" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort_values returned a new, sorted table, so df_nba still has its original row order\n", "# unless we assign the result back or sort in place...\n", "df_nba.loc[df_nba['POSITION'] == \"PG\"][:5]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Maybe see what the position distribution is...\n", "# (pass the column as x=; newer seaborn releases only accept `data` positionally)\n", "sns.countplot(x=df_nba['POSITION'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## We can also use Pandas to read a CSV that is online...\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Alternative: read the responses from a local CSV file instead.\n", "#df_class_survey = pd.read_csv(\"./Data Science Day 1 Questions (Responses) - Form Responses 1.csv\")\n", "\n", "# We can also read directly from a Google Sheet if we want. Note that at the end of the URL we have to add `/export?gid=1081980213&format=csv`\n", "# The gid field tells us which sheet to load and the format field gives us CSV\n", "df_class_survey = pd.read_csv(\"https://docs.google.com/spreadsheets/d/1d4C9HEIOkL7x_W4rYCRsflt_Mw7I6DGLUbAAUwwIqUM/export?gid=1081980213&format=csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_class_survey[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# See how the class answered the 'I use Jupyter Notebooks' question...\n", "sns.countplot(x=df_class_survey['I use Jupyter Notebooks'])\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Same plot, but with the x tick labels rotated by -85 degrees...\n", "g = sns.countplot(x=df_class_survey['I use Jupyter Notebooks'])\n", "g.set_xticklabels(g.get_xticklabels(),rotation=-85)\n", "display(g)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# That was fun, let's try to read some books..." 
] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from urllib.request import urlopen\n", "import re\n", "\n", "def read_url(url):\n", "    \"\"\"Fetch `url` and return its text with each run of whitespace collapsed to one space.\"\"\"\n", "    # The context manager closes the HTTP response even if read() raises;\n", "    # the bare urlopen(url).read() chain left the connection open.\n", "    with urlopen(url) as response:\n", "        raw_bytes = response.read()\n", "    # bytes.decode() already defaults to UTF-8; naming the codec makes that assumption visible.\n", "    return re.sub('\\\\s+', ' ', raw_bytes.decode('utf-8'))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "\n", "# Read two books, fast!\n", "\n", "huck_finn_url = 'https://www.inferentialthinking.com/data/huck_finn.txt'\n", "huck_finn_text = read_url(huck_finn_url)\n", "# [44:] drops the pieces that precede Chapter I -- presumably front matter and\n", "# table-of-contents entries; TODO confirm against the downloaded file.\n", "huck_finn_chapters = huck_finn_text.split('CHAPTER ')[44:]\n", "\n", "little_women_url = 'https://www.inferentialthinking.com/data/little_women.txt'\n", "little_women_text = read_url(little_women_url)\n", "little_women_chapters = little_women_text.split('CHAPTER ')[1:]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "'III. WELL, I got a good going-over in the morning from old Miss Watson on account of my clothes; but the widow she didn\\'t scold, but only cleaned off the grease and clay, and looked so sorry that I thought I would behave awhile if I could. Then Miss Watson she took me in the closet and prayed, but nothing come of it. She told me to pray every day, and whatever I asked for I would get it. But it warn\\'t so. I tried it. Once I got a fish-line, but no hooks. It warn\\'t any good to me without hooks. I tried for the hooks three or four times, but somehow I couldn\\'t make it work. By and by, one day, I asked Miss Watson to try for me, but she said I was a fool. She never told me why, and I couldn\\'t make it out no way. I set down one time back in the woods, and had a long think about it. I says to myself, if a body can get anything they pray for, why don\\'t Deacon Winn get back the money he lost on pork? Why can\\'t the widow get back her silver snuffbox that was stole? Why can\\'t Miss Watson fat up? No, says I to my self, there ain\\'t nothing in it. 
I went and told the widow about it, and she said the thing a body could get by praying for it was \"spiritual gifts.\" This was too many for me, but she told me what she meant--I must help other people, and do everything I could for other people, and look out for them all the time, and never think about myself. This was including Miss Watson, as I took it. I went out in the woods and turned it over in my mind a long time, but I couldn\\'t see no advantage about it--except for the other people; so at last I reckoned I wouldn\\'t worry about it any more, but just let it go. Sometimes the widow would take me one side and talk about Providence in a way to make a body\\'s mouth water; but maybe next day Miss Watson would take hold and knock it all down again. I judged I could see that there was two Providences, and a poor chap would stand considerable show with the widow\\'s Providence, but if Miss Watson\\'s got him there warn\\'t no help for him any more. I thought it all out, and reckoned I would belong to the widow\\'s if he wanted me, though I couldn\\'t make out how he was a-going to be any better off then than what he was before, seeing I was so ignorant, and so kind of low-down and ornery. Pap he hadn\\'t been seen for more than a year, and that was comfortable for me; I didn\\'t want to see him no more. He used to always whale me when he was sober and could get his hands on me; though I used to take to the woods most of the time when he was around. Well, about this time he was found in the river drownded, about twelve mile above town, so people said. They judged it was him, anyway; said this drownded man was just his size, and was ragged, and had uncommon long hair, which was all like pap; but they couldn\\'t make nothing out of the face, because it had been in the water so long it warn\\'t much like a face at all. They said he was floating on his back in the water. They took him and buried him on the bank. 
But I warn\\'t comfortable long, because I happened to think of something. I knowed mighty well that a drownded man don\\'t float on his back, but on his face. So I knowed, then, that this warn\\'t pap, but a woman dressed up in a man\\'s clothes. So I was uncomfortable again. I judged the old man would turn up again by and by, though I wished he wouldn\\'t. We played robber now and then about a month, and then I resigned. All the boys did. We hadn\\'t robbed nobody, hadn\\'t killed any people, but only just pretended. We used to hop out of the woods and go charging down on hog-drivers and women in carts taking garden stuff to market, but we never hived any of them. Tom Sawyer called the hogs \"ingots,\" and he called the turnips and stuff \"julery,\" and we would go to the cave and powwow over what we had done, and how many people we had killed and marked. But I couldn\\'t see no profit in it. One time Tom sent a boy to run about town with a blazing stick, which he called a slogan (which was the sign for the Gang to get together), and then he said he had got secret news by his spies that next day a whole parcel of Spanish merchants and rich A-rabs was going to camp in Cave Hollow with two hundred elephants, and six hundred camels, and over a thousand \"sumter\" mules, all loaded down with di\\'monds, and they didn\\'t have only a guard of four hundred soldiers, and so we would lay in ambuscade, as he called it, and kill the lot and scoop the things. He said we must slick up our swords and guns, and get ready. He never could go after even a turnip-cart but he must have the swords and guns all scoured up for it, though they was only lath and broomsticks, and you might scour at them till you rotted, and then they warn\\'t worth a mouthful of ashes more than what they was before. 
I didn\\'t believe we could lick such a crowd of Spaniards and A-rabs, but I wanted to see the camels and elephants, so I was on hand next day, Saturday, in the ambuscade; and when we got the word we rushed out of the woods and down the hill. But there warn\\'t no Spaniards and A-rabs, and there warn\\'t no camels nor no elephants. It warn\\'t anything but a Sunday-school picnic, and only a primer-class at that. We busted it up, and chased the children up the hollow; but we never got anything but some doughnuts and jam, though Ben Rogers got a rag doll, and Jo Harper got a hymn-book and a tract; and then the teacher charged in, and made us drop everything and cut. I didn\\'t see no di\\'monds, and I told Tom Sawyer so. He said there was loads of them there, anyway; and he said there was A-rabs there, too, and elephants and things. I said, why couldn\\'t we see them, then? He said if I warn\\'t so ignorant, but had read a book called Don Quixote, I would know without asking. He said it was all done by enchantment. He said there was hundreds of soldiers there, and elephants and treasure, and so on, but we had enemies which he called magicians; and they had turned the whole thing into an infant Sunday-school, just out of spite. I said, all right; then the thing for us to do was to go for the magicians. Tom Sawyer said I was a numskull. \"Why,\" said he, \"a magician could call up a lot of genies, and they would hash you up like nothing before you could say Jack Robinson. They are as tall as a tree and as big around as a church.\" \"Well,\" I says, \"s\\'pose we got some genies to help _us_--can\\'t we lick the other crowd then?\" \"How you going to get them?\" \"I don\\'t know. How do _they_ get them?\" \"Why, they rub an old tin lamp or an iron ring, and then the genies come tearing in, with the thunder and lightning a-ripping around and the smoke a-rolling, and everything they\\'re told to do they up and do it. 
They don\\'t think nothing of pulling a shot-tower up by the roots, and belting a Sunday-school superintendent over the head with it--or any other man.\" \"Who makes them tear around so?\" \"Why, whoever rubs the lamp or the ring. They belong to whoever rubs the lamp or the ring, and they\\'ve got to do whatever he says. If he tells them to build a palace forty miles long out of di\\'monds, and fill it full of chewing-gum, or whatever you want, and fetch an emperor\\'s daughter from China for you to marry, they\\'ve got to do it--and they\\'ve got to do it before sun-up next morning, too. And more: they\\'ve got to waltz that palace around over the country wherever you want it, you understand.\" \"Well,\" says I, \"I think they are a pack of flat-heads for not keeping the palace themselves \\'stead of fooling them away like that. And what\\'s more--if I was one of them I would see a man in Jericho before I would drop my business and come to him for the rubbing of an old tin lamp.\" \"How you talk, Huck Finn. Why, you\\'d _have_ to come when he rubbed it, whether you wanted to or not.\" \"What! and I as high as a tree and as big as a church? All right, then; I _would_ come; but I lay I\\'d make that man climb the highest tree there was in the country.\" \"Shucks, it ain\\'t no use to talk to you, Huck Finn. You don\\'t seem to know anything, somehow--perfect saphead.\" I thought all this over for two or three days, and then I reckoned I would see if there was anything in it. I got an old tin lamp and an iron ring, and went out in the woods and rubbed and rubbed till I sweat like an Injun, calculating to build a palace and sell it; but it warn\\'t no use, none of the genies come. So then I judged that all that stuff was only just one of Tom Sawyer\\'s lies. I reckoned he believed in the A-rabs and the elephants, but as for me I think different. It had all the marks of a Sunday-school. 
'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "huck_finn_chapters[2]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Turn it into a data frame..\n", "df_huck = pd.DataFrame(huck_finn_chapters, columns=[\"Text\"])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Text
0I. YOU don't know about me without you have re...
1II. WE went tiptoeing along a path amongst the...
2III. WELL, I got a good going-over in the morn...
3IV. WELL, three or four months run along, and ...
4V. I had shut the door to. Then I turned aroun...
\n", "
" ], "text/plain": [ " Text\n", "0 I. YOU don't know about me without you have re...\n", "1 II. WE went tiptoeing along a path amongst the...\n", "2 III. WELL, I got a good going-over in the morn...\n", "3 IV. WELL, three or four months run along, and ...\n", "4 V. I had shut the door to. Then I turned aroun..." ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(df_huck[:5])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# Count how many times we see each character...\n", "# Here we make a data frame out of a dictionary where the index is the column name\n", "# and the values are the column\n", "counts = pd.DataFrame({\n", " 'Jim':np.char.count(huck_finn_chapters, 'Jim'),\n", " 'Tom':np.char.count(huck_finn_chapters, 'Tom'),\n", " 'Huck':np.char.count(huck_finn_chapters, 'Huck')\n", " })" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
JimTomHuck
0063
116242
2052
3801
4000
\n", "
" ], "text/plain": [ " Jim Tom Huck\n", "0 0 6 3\n", "1 16 24 2\n", "2 0 5 2\n", "3 8 0 1\n", "4 0 0 0" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "counts[:5]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "ax = counts.cumsum().plot(figsize=(10,8))\n", "ax.set_xlabel(\"Chapter\")\n", "ax.set_ylabel(\"Number of Times\")\n", "#ax.set_ylim((-5,310))\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are lots of options for the figures ... Note that here we are using [Pandas Plot](https://pandas.pydata.org/pandas-docs/version/0.23/generated/pandas.DataFrame.plot.html) which is a wrapper around [MatPlot's Plot](https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.plot.html)." ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "ax = counts.cumsum().plot(figsize=(10,8), fontsize=(5),\n", " lw=2, \n", " markersize=12,\n", " style=['X-','o-.','v--','s:','d:','*-.'])\n", "ax.set_xlabel(\"Chapter\")\n", "ax.set_ylabel(\"Number of Times\")\n", "ax.set_ylim((-5,310))\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# Now for Little women...\n", "\n", "people = ['Amy', 'Beth', 'Jo', 'Laurie', 'Meg']\n", "people_counts = {pp: np.char.count(little_women_chapters, pp) for pp in people}\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['Amy', 'Beth', 'Jo', 'Laurie', 'Meg'])" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "people_counts.keys()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([26, 12, 2, 18, 14, 28, 5, 9, 5, 5, 20, 20, 13, 5, 12, 9, 29,\n", " 30, 7, 7, 1, 15, 6, 9, 6, 5, 4, 0, 1, 5, 3, 33, 6, 2,\n", " 3, 37, 1, 0, 1, 21, 7, 9, 8, 0, 2, 1, 3])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "people_counts['Beth']" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AmyBethJoLaurieMeg
0232644026
1131221020
222621636
3141834017
4614553513
\n", "
" ], "text/plain": [ " Amy Beth Jo Laurie Meg\n", "0 23 26 44 0 26\n", "1 13 12 21 0 20\n", "2 2 2 62 16 36\n", "3 14 18 34 0 17\n", "4 6 14 55 35 13" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make a pandas table...\n", "counts = pd.DataFrame(people_counts)\n", "counts[:5]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "ax = counts.cumsum().plot(figsize=(10,8), fontsize=(15),\n", " lw=2, \n", " markersize=12,\n", " style=['X-','o-.','v--','s:','d:','*-.'])\n", "ax.set_xlabel(\"Chapter\")\n", "ax.set_ylabel(\"Number of Times\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Something more fun...\n", "\n", "Inspired by the [Inferential Thinking Book](https://www.inferentialthinking.com/chapters/01/3/2/Another_Kind_Of_Character) let's do some more analysis on the text that we have loaded up.\n", "\n", "First let's count the number of periods and the total number of characters in each of the books." ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "I. YOU don't know about me without you have read a\n", "ONE PLAYING PILGRIMS \"Christmas won't be Christmas\n" ] } ], "source": [ "# Recall that each element in the array corresponds to a chapter.\n", "print(huck_finn_chapters[0][:50])\n", "print(little_women_chapters[0][:50])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "chars_periods_huck_finn = pd.DataFrame({\n", " 'Huck Finn Chapter Length':[len(s) for s in huck_finn_chapters],\n", " 'Number of Periods':np.char.count(huck_finn_chapters, '.')\n", " })\n", "chars_periods_little_women = pd.DataFrame({\n", " 'Little Women Chapter Length': [len(s) for s in little_women_chapters],\n", " 'Number of Periods': np.char.count(little_women_chapters, '.')\n", " })" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Huck Finn Chapter LengthNumber of Periods
0702666
111982117
2852972
3679984
4816691
\n", "
" ], "text/plain": [ " Huck Finn Chapter Length Number of Periods\n", "0 7026 66\n", "1 11982 117\n", "2 8529 72\n", "3 6799 84\n", "4 8166 91" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Little Women Chapter LengthNumber of Periods
021759189
122148188
220558231
325526195
423395255
\n", "
" ], "text/plain": [ " Little Women Chapter Length Number of Periods\n", "0 21759 189\n", "1 22148 188\n", "2 20558 231\n", "3 25526 195\n", "4 23395 255" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(chars_periods_huck_finn[:5])\n", "display(chars_periods_little_women[:5])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What do we notice about the above? It seems like *Little Women* is significantly longer per chapter than *Huck Finn*. Let's try plotting this relationship on the same graph.\n", "\n", "To do this we are going to use the `scatter` function from [MatPlotLib](https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.scatter.html)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'Number of Periods'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/usr/local/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 2601\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2602\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2603\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in 
\u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: 'Number of Periods'", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfigure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfigsize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m6\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m plt.scatter(chars_periods_huck_finn[\"Number of Periods\"], \n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mchars_periods_huck_finn\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Huck Finn Chapter Length\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m color='darkblue')\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2915\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnlevels\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2916\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2917\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2918\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2919\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/local/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 2602\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2603\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2604\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_cast_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 
2605\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtolerance\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2606\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mindexer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: 'Number of Periods'" ] }, { "data": { "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(6, 6))\n", "plt.scatter(chars_periods_huck_finn[\"Number of Periods\"], \n", " chars_periods_huck_finn[\"Huck Finn Chapter Length\"], \n", " color='darkblue')\n", "\n", "plt.scatter(chars_periods_little_women[\"Number of Periods\"], \n", " chars_periods_little_women[\"Little Women Chapter Length\"], \n", " color='gold')\n", "\n", "plt.xlabel('Number of periods in chapter')\n", "plt.ylabel('Number of characters in chapter')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The above plot shows us a few things:\n", "1. Little Women is much longer on average than Huck\n", "2. There seems to be a linear relationship between the number of characters and the number of periods\n", "\n", "If we look at all the chapters that have 100 periods we see they have 10,000 - 15,000 characters.. or roughly 100-150 characters per sentence. Seems like a Tweet." ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
charactersperiods
0702666
111982117
2852972
3679984
4816691
\n", "
" ], "text/plain": [ " characters periods\n", "0 7026 66\n", "1 11982 117\n", "2 8529 72\n", "3 6799 84\n", "4 8166 91" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "43" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's formally find the relationship...\n", "from scipy import stats\n", "\n", "# First let's make the tables the same..\n", "chars_periods_huck_finn.columns = ['characters', 'periods']\n", "chars_periods_little_women.columns = ['characters', 'periods']\n", "display(chars_periods_huck_finn[:5])\n", "len(chars_periods_huck_finn)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
charactersperiods
0702666
111982117
2852972
3679984
4816691
514550125
613218127
722208249
8808171
9703670
1014437168
1114665129
121023483
13762089
141204698
1517027183
1617182145
1724031202
1817259105
1918338122
2019172275
211043170
2211442102
2312041103
2414971107
2514717145
261360899
2718347160
2819038127
29611939
.........
1717432144
1817338121
1914252145
2023545269
2113708120
2223015247
2324239182
241352691
2521739150
2615754109
2730992271
2826254233
2924137218
3020723178
3123158224
3225777232
3332496257
3421310201
3511441100
3623524157
3726091206
3827473263
391136861
4026464187
4116753118
4233202305
431028995
441255896
4527094234
4640935392
\n", "

90 rows × 2 columns

\n", "
" ], "text/plain": [ " characters periods\n", "0 7026 66\n", "1 11982 117\n", "2 8529 72\n", "3 6799 84\n", "4 8166 91\n", "5 14550 125\n", "6 13218 127\n", "7 22208 249\n", "8 8081 71\n", "9 7036 70\n", "10 14437 168\n", "11 14665 129\n", "12 10234 83\n", "13 7620 89\n", "14 12046 98\n", "15 17027 183\n", "16 17182 145\n", "17 24031 202\n", "18 17259 105\n", "19 18338 122\n", "20 19172 275\n", "21 10431 70\n", "22 11442 102\n", "23 12041 103\n", "24 14971 107\n", "25 14717 145\n", "26 13608 99\n", "27 18347 160\n", "28 19038 127\n", "29 6119 39\n", ".. ... ...\n", "17 17432 144\n", "18 17338 121\n", "19 14252 145\n", "20 23545 269\n", "21 13708 120\n", "22 23015 247\n", "23 24239 182\n", "24 13526 91\n", "25 21739 150\n", "26 15754 109\n", "27 30992 271\n", "28 26254 233\n", "29 24137 218\n", "30 20723 178\n", "31 23158 224\n", "32 25777 232\n", "33 32496 257\n", "34 21310 201\n", "35 11441 100\n", "36 23524 157\n", "37 26091 206\n", "38 27473 263\n", "39 11368 61\n", "40 26464 187\n", "41 16753 118\n", "42 33202 305\n", "43 10289 95\n", "44 12558 96\n", "45 27094 234\n", "46 40935 392\n", "\n", "[90 rows x 2 columns]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now we are going to concatinate the data together -- this is our first join operation!\n", "\n", "merged = pd.concat([chars_periods_huck_finn, chars_periods_little_women])\n", "merged" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "90" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(merged)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "slope, intercept, r_value, p_value, std_err = stats.linregress(merged['periods'],merged['characters'])" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "89.0611539393866" ] }, "execution_count": 38, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "slope" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3664.067173280624" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "intercept" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9233365944610994" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "r_value" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.413147744447107e-38" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "p_value" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.9482862802859104" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "std_err" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "line = slope * merged['periods'] + intercept" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we can add the line above to our plot using the [plot function](https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.plot.html)." ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0, 0.5, 'Number of characters in chapter')" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(10, 10))\n", "plt.scatter(chars_periods_huck_finn[\"periods\"], \n", " chars_periods_huck_finn[\"characters\"], \n", " color='darkblue')\n", "\n", "plt.scatter(chars_periods_little_women[\"periods\"], \n", " chars_periods_little_women[\"characters\"], \n", " color='gold')\n", "\n", "plt.plot(merged['periods'], line, lw=1, ls=':')\n", "\n", "plt.xlabel('Number of periods in chapter')\n", "plt.ylabel('Number of characters in chapter')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" } }, "nbformat": 4, "nbformat_minor": 2 }