{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "judicial-clearing", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 6, "id": "defensive-liver", "metadata": {}, "outputs": [], "source": [ "data = pd.read_csv('MediaSearch_20210127.tsv',sep='\\t',header=None)" ] }, { "cell_type": "code", "execution_count": 8, "id": "brave-cylinder", "metadata": {}, "outputs": [], "source": [ "# Columns 1-16 hold text shaped like '<prefix>:<value>' (column 1 is presumably 'qid:<n>'); keep what follows the first ':'.\n", "# Going through astype(str) lets this cell be re-run on already-parsed columns without raising.\n", "def strip_prefix(col):\n", "    # Drop everything up to and including the first ':'; values without a ':' are left unchanged.\n", "    return col.astype(str).str.replace(r'^[^:]*:', '', regex=True)\n", "\n", "score_cols = list(data.columns[2:17])\n", "data[1] = strip_prefix(data[1]).astype(int)\n", "# Assign whole columns rather than through .iloc so the parsed columns get a float dtype instead of staying object.\n", "data[score_cols] = data[score_cols].apply(lambda col: strip_prefix(col).astype(float))" ] }, { "cell_type": "code", "execution_count": 70, "id": "unable-shore", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789101112131415161718
7431-180.0000000.000000.0000000.0000000.0000000.0000000.00.00.0000000.017.07251018.3499980.0000000.0000000.0# Patterson's General Store Ledger - DPLA - 00...Miss James
7432-180.0000000.000000.0000000.0000000.0000000.0000000.00.00.0000000.017.07251018.3499980.0000000.0000000.0# Patterson's General Store Ledger - DPLA - 00...Miss James
7433-180.0000000.000005.5876924.94994310.36335910.3633590.00.07.2919620.011.98600012.2896140.0000000.0000000.0# James Meredith's Letter to the Registrar, Un...Miss James
74341815.92740416.8835116.05894915.29192012.41814212.4181420.00.022.1271020.019.74137318.95845417.16188817.9417860.0# Miss James by Camille Silvy (cropped).jpgMiss James
7435080.0000000.0000015.97781416.0528900.0000000.0000000.00.022.9878900.017.30608417.8252700.0000000.0000000.0# Hindhead Tunnel Miss James Bridge.JPGMiss James
7436-180.0000000.000000.0000000.0000000.0000000.0000000.00.00.0000000.019.74619920.0567360.0000000.0000000.0# Herstmonceux Place - geograph.org.uk - 15863...Miss James
7437-1817.79538316.7528015.97652516.07052014.96597714.9659770.00.023.0228420.017.44126515.37114414.03435014.1177580.0# Miss James' Walk NT sign.jpgMiss James
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 \\\n", "7431 -1 8 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 \n", "7432 -1 8 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 \n", "7433 -1 8 0.000000 0.00000 5.587692 4.949943 10.363359 10.363359 \n", "7434 1 8 15.927404 16.88351 16.058949 15.291920 12.418142 12.418142 \n", "7435 0 8 0.000000 0.00000 15.977814 16.052890 0.000000 0.000000 \n", "7436 -1 8 0.000000 0.00000 0.000000 0.000000 0.000000 0.000000 \n", "7437 -1 8 17.795383 16.75280 15.976525 16.070520 14.965977 14.965977 \n", "\n", " 8 9 10 11 12 13 14 15 \\\n", "7431 0.0 0.0 0.000000 0.0 17.072510 18.349998 0.000000 0.000000 \n", "7432 0.0 0.0 0.000000 0.0 17.072510 18.349998 0.000000 0.000000 \n", "7433 0.0 0.0 7.291962 0.0 11.986000 12.289614 0.000000 0.000000 \n", "7434 0.0 0.0 22.127102 0.0 19.741373 18.958454 17.161888 17.941786 \n", "7435 0.0 0.0 22.987890 0.0 17.306084 17.825270 0.000000 0.000000 \n", "7436 0.0 0.0 0.000000 0.0 19.746199 20.056736 0.000000 0.000000 \n", "7437 0.0 0.0 23.022842 0.0 17.441265 15.371144 14.034350 14.117758 \n", "\n", " 16 17 18 \n", "7431 0.0 # Patterson's General Store Ledger - DPLA - 00... Miss James \n", "7432 0.0 # Patterson's General Store Ledger - DPLA - 00... Miss James \n", "7433 0.0 # James Meredith's Letter to the Registrar, Un... Miss James \n", "7434 0.0 # Miss James by Camille Silvy (cropped).jpg Miss James \n", "7435 0.0 # Hindhead Tunnel Miss James Bridge.JPG Miss James \n", "7436 0.0 # Herstmonceux Place - geograph.org.uk - 15863... 
Miss James \n", "7437 0.0 # Miss James' Walk NT sign.jpg Miss James " ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.loc[data[1] == 8]  # select qid" ] }, { "cell_type": "code", "execution_count": 11, "id": "recreational-leonard", "metadata": {}, "outputs": [], "source": [ "# Independent copy of the 15 score columns (positions 2..16), relabelled '1'..'15'.\n", "feature = data.iloc[:, 2:17].copy()\n", "feature.columns = list(map(str, range(1, 16)))" ] }, { "cell_type": "code", "execution_count": 54, "id": "martial-luxembourg", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
123456789101112131415
count8783.0000008783.0000008783.0000008783.0000008783.0000008783.0000008783.0000008783.0000008783.0000008783.08783.0000008783.0000008783.0000008783.0000008783.000000
mean1.8731181.9196549.7290969.3153738.1790328.1790321.9608761.88269711.6086550.013.28179512.3981023.9871703.7973451.606014
std4.6562794.6514938.0467138.1894608.1417908.1417904.9050014.86231511.1701360.08.8568938.9785997.2098127.1094954.235872
min0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.00.0000000.0000000.0000000.0000000.000000
25%0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.07.3604866.2308770.0000000.0000000.000000
50%0.0000000.0000009.8896089.5371167.4365587.4365580.0000000.00000011.4901620.012.59385711.6312140.0000000.0000000.000000
75%0.0000000.00000014.97032014.61392513.45965713.4596570.0000000.00000017.8130280.018.69549517.7508537.3269076.3764960.000000
max31.27809730.21830243.35006744.52718447.12815547.12815530.32835030.14207873.6894000.054.40340055.52229347.79889045.36058814.294704
\n", "
" ], "text/plain": [ " 1 2 3 4 5 \\\n", "count 8783.000000 8783.000000 8783.000000 8783.000000 8783.000000 \n", "mean 1.873118 1.919654 9.729096 9.315373 8.179032 \n", "std 4.656279 4.651493 8.046713 8.189460 8.141790 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "50% 0.000000 0.000000 9.889608 9.537116 7.436558 \n", "75% 0.000000 0.000000 14.970320 14.613925 13.459657 \n", "max 31.278097 30.218302 43.350067 44.527184 47.128155 \n", "\n", " 6 7 8 9 10 \\\n", "count 8783.000000 8783.000000 8783.000000 8783.000000 8783.0 \n", "mean 8.179032 1.960876 1.882697 11.608655 0.0 \n", "std 8.141790 4.905001 4.862315 11.170136 0.0 \n", "min 0.000000 0.000000 0.000000 0.000000 0.0 \n", "25% 0.000000 0.000000 0.000000 0.000000 0.0 \n", "50% 7.436558 0.000000 0.000000 11.490162 0.0 \n", "75% 13.459657 0.000000 0.000000 17.813028 0.0 \n", "max 47.128155 30.328350 30.142078 73.689400 0.0 \n", "\n", " 11 12 13 14 15 \n", "count 8783.000000 8783.000000 8783.000000 8783.000000 8783.000000 \n", "mean 13.281795 12.398102 3.987170 3.797345 1.606014 \n", "std 8.856893 8.978599 7.209812 7.109495 4.235872 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 7.360486 6.230877 0.000000 0.000000 0.000000 \n", "50% 12.593857 11.631214 0.000000 0.000000 0.000000 \n", "75% 18.695495 17.750853 7.326907 6.376496 0.000000 \n", "max 54.403400 55.522293 47.798890 45.360588 14.294704 " ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature.describe()" ] }, { "cell_type": "code", "execution_count": 101, "id": "different-british", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
123456789101112131415
021.59961020.68296616.61670016.7380700.0000000.0000000.0000000.00000023.0662380.018.57168616.56506714.09811014.8341250.000000
10.0000000.00000017.82968117.9550600.0000000.0000000.0000000.00000023.2442550.020.54407519.7959020.0000000.0000000.000000
20.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.00.0000000.0000000.0000000.00000013.733603
30.0000000.00000013.58403013.6913939.7828379.7828370.0000000.00000021.3102150.017.36810117.4810640.0000000.00000014.262909
40.0000000.00000014.19413014.26631410.03650910.03650914.34054214.65528723.0263000.012.06278012.2862950.0000000.0000000.000000
................................................
87780.0000000.0000000.0000000.0000005.4721885.4721880.0000000.0000000.0000000.010.71478710.6219510.0000000.00000011.901220
87790.0000000.0000008.8625788.5129569.2095099.2095098.9340179.08581913.4066170.011.30706511.2444680.0000000.0000000.000000
87800.0000000.0000000.0000000.0000007.2522797.2522790.0000000.0000000.0000000.010.1667059.7031000.0000000.0000000.000000
878111.45362110.80798212.64771512.3279560.0000000.0000000.0000000.00000014.8472950.011.63413911.38257513.58971213.6593420.000000
87820.0000000.0000000.0000000.0000008.4493778.4493770.0000000.0000000.0000000.010.39974210.2227920.0000000.0000000.000000
\n", "

8783 rows × 15 columns

\n", "
" ], "text/plain": [ " 1 2 3 4 5 6 \\\n", "0 21.599610 20.682966 16.616700 16.738070 0.000000 0.000000 \n", "1 0.000000 0.000000 17.829681 17.955060 0.000000 0.000000 \n", "2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "3 0.000000 0.000000 13.584030 13.691393 9.782837 9.782837 \n", "4 0.000000 0.000000 14.194130 14.266314 10.036509 10.036509 \n", "... ... ... ... ... ... ... \n", "8778 0.000000 0.000000 0.000000 0.000000 5.472188 5.472188 \n", "8779 0.000000 0.000000 8.862578 8.512956 9.209509 9.209509 \n", "8780 0.000000 0.000000 0.000000 0.000000 7.252279 7.252279 \n", "8781 11.453621 10.807982 12.647715 12.327956 0.000000 0.000000 \n", "8782 0.000000 0.000000 0.000000 0.000000 8.449377 8.449377 \n", "\n", " 7 8 9 10 11 12 13 \\\n", "0 0.000000 0.000000 23.066238 0.0 18.571686 16.565067 14.098110 \n", "1 0.000000 0.000000 23.244255 0.0 20.544075 19.795902 0.000000 \n", "2 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 \n", "3 0.000000 0.000000 21.310215 0.0 17.368101 17.481064 0.000000 \n", "4 14.340542 14.655287 23.026300 0.0 12.062780 12.286295 0.000000 \n", "... ... ... ... ... ... ... ... \n", "8778 0.000000 0.000000 0.000000 0.0 10.714787 10.621951 0.000000 \n", "8779 8.934017 9.085819 13.406617 0.0 11.307065 11.244468 0.000000 \n", "8780 0.000000 0.000000 0.000000 0.0 10.166705 9.703100 0.000000 \n", "8781 0.000000 0.000000 14.847295 0.0 11.634139 11.382575 13.589712 \n", "8782 0.000000 0.000000 0.000000 0.0 10.399742 10.222792 0.000000 \n", "\n", " 14 15 \n", "0 14.834125 0.000000 \n", "1 0.000000 0.000000 \n", "2 0.000000 13.733603 \n", "3 0.000000 14.262909 \n", "4 0.000000 0.000000 \n", "... ... ... 
\n", "8778 0.000000 11.901220 \n", "8779 0.000000 0.000000 \n", "8780 0.000000 0.000000 \n", "8781 13.659342 0.000000 \n", "8782 0.000000 0.000000 \n", "\n", "[8783 rows x 15 columns]" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature" ] }, { "cell_type": "code", "execution_count": 12, "id": "electronic-provider", "metadata": {}, "outputs": [], "source": [ "rating = data[0].copy()" ] }, { "cell_type": "code", "execution_count": 16, "id": "chicken-munich", "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEGCAYAAAB1iW6ZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAcFklEQVR4nO3de3RU9b338ffXgHIJRrw0KqFcPB4r5RJCBJSDQqmKl2Kp9DlSVOJTZNk+iEepz6IPLquULm9V8XbwcqxSa0XkKIcqCtUSFaseLiYg4AUBlYigeONSVML3+WM2OUmYkEmyN2R+fF5rZTmzZ89nfm72fLKzZ+895u6IiEj2O2h/D0BEROKhQhcRCYQKXUQkECp0EZFAqNBFRALRYn+98JFHHumdO3fOaN5t27bRtm3bRMaRVHa25SaZrdzks7MtN8nsbMttaPaSJUs+dfej0j7o7vvlp0+fPp6pBQsWZDxvQyWVnW25SWYrN/nsbMtNMjvbchuaDSz2OnpVu1xERAKhQhcRCYQKXUQkEPvtQ9F0vv32W9avX8+OHTtqTM/Ly2PVqlWJvGZS2dmWm2R2krlr166loKCAli1bxp4vkm2aVaGvX7+edu3a0blzZ8ysavqWLVto165dIq+ZVHa25SaZnVTuV199xTfffMP69evp0qVL7Pki2aZZ7XLZsWMHRxxxRI0yF6mLmXHEEUfs8RedyIGqWRU6oDKXBtH6IvI/ml2hi4hI4zSrfei1dZ74TKx56248J7asqVOnMnbsWNq0adPkrLfeeosLLrgAM2PWrFkcd9xxMYxQRA40zbrQ96eqM68OSv9HzNSpU7nwwgtjKfTZs2czYsQIrrnmmiZnJaGyspKcnJx9+po7d+6kRQutnvtSug2oCT12UpJmepwbRxIf7XKpZt26dZxwwglcfPHFdO/enQ8//JBf/OIXFBcX8/3vf5/f/OY3ANx555189NFHDB48mMGDBwMwf/58Tj75ZIqKivjpT3/K1q1b98gvKyujf//+9OzZk+HDh/P5558zd+5cpk6dyrRp06qyqsvNzWXSpEn06tWL/v37s2nTpqqx/uAHP6Bnz54MGTKEDz74AICSkhLGjx/PKaecQteuXZk1axYA1157LYWFhRQWFtKhQwcuueQSAP70pz/Rt29fCgsLueKKK6isrKx63QkTJtCrVy9effVVbrvtNrp370737t2ZOnXqHuOsrKykpKSE7t2706NHD26//XYAVq9ezbBhw+jVqxdFRUW89957uDtXX3111byPP/44AKWlpQwcOJBhw4bRrVs3KisrufrqqznppJPo2bMn9913X6P/bUUOBCr0Wt59911++ctfsmLFCjp16sTvfvc7Fi9ezLJly3jxxRdZtmwZ48eP59hjj2XBggUsWLCATz/9lClTpvD888+zdOlSiouL
ufvuu/fIvvjii7nppptYtmwZPXr04Prrr+fss8/msssu48orr2TBggV7PGfbtm3079+f8vJyTj31VB5++GEALr/8ckaPHs2yZcsYNWoU48ePr3rOhg0bWLhwIU8//TQTJ04EYPLkyZSVlVFaWsrhhx/OuHHjWLVqFY8//jivvPIKZWVlHHTQQTz66KNVr9uvXz/Ky8tp3bo1Dz30EK+//jqvvfYaDzzwAG+88UaNcZaVlVFRUcGbb77J8uXLq35hjBo1iksvvZTy8nL+/ve/c8wxx/Dkk09SVlZGeXk5zz//PFdffTUbNmwAYOnSpdxxxx288847PPjgg+Tl5bFo0SIWLVrEAw88wNq1a5v+jywSKBV6LZ06daJ///5V92fOnElRURG9e/dmxYoVrFy5co/nvPbaa6xcuZIBAwZQWFjI9OnT+fDDD2vM8+WXX/LFF19w2mmnATB69Gheeumlesdz8MEHc+655wLQp0+fqi3xV199lZ/97GcAXHTRRSxcuLDqOT/+8Y856KCD6NatGxs3bqya7u5ceOGFXHXVVfTp04cXXniBJUuWcNJJJ1FYWMiLL77ImjVrAMjJyeH8888HYOHChQwfPpy2bduSm5vLT37yE15++eUa4+zatStr1qzh8ssv57nnnuPQQw9ly5YtVFRU8KMf/QiAVq1a0aZNGxYuXMjIkSPJyckhPz+f0047jUWLFgHQt2/fqmPK58+fzx//+EcKCwvp168fmzdv5t133613mYkcqLSTspbql7Bcu3Ytv//971m0aBHt27enpKQk7THP7s7pp5/OY489VjVty5YtsYynZcuWVYfm5eTksHPnznqfc8ghh9QY227XXXcdBQUFVVvP7s7o0aO54YYbqsa8+wSgVq1aNWi/efv27SkvL2fevHnce++9zJw5kzvuuCPj5+9Wffm7O3fddRdnnnlmg3NEDkTaQt+Lr776irZt25KXl8fGjRt59tlnqx5r165dVWn379+fV155hdWrVwOp3RW1tyTz8vJo37591ZbtI488UrW13hinnHIKM2bMAODRRx9l4MCBe53/L3/5C88//zx33nln1bQhQ4Ywa9asqv3yn332Ge+///4ezx04cCCzZ89m+/btbNu2jaeeemqP1/v000/ZtWsX559/PlOmTGHp0qW0a9eOgoICnn76aQC+/vprtm/fzsCBA3n88ceprKzkk08+4aWXXqJv3757vO6ZZ57JtGnT+PbbbwF455132LZtWwOWksiBpVlvoe/+JD3J0933plevXvTu3Zvvfe97dOzYkQEDBlQ9NnbsWIYOHVq1L/3hhx9m5MiRfP311wBMmjSJoqKiGnnTp0/nsssuY/v27XTt2pWHHnqo0WO76667uOSSS7jllls46qij6s267bbbqKioqCrOYcOGMXnyZKZMmcIZZ5zBrl27yMnJYdq0aXTq1KnGc4uKiigpKal67pgxY+jdu3eNeSoqKrjkkkvYtWsXQNVW/yOPPMKYMWO44YYbaNmyJU888QTDhw/n1VdfpVevXpgZN998M0cffTRvvfVWjcwxY8awbt06ioqKcHeOOuooZs+e3ehlJhI6q/4n+b5UXFzsixcvrjFt1apVnHjiiXvMq+uXJJ+bZHbSuXWtN41VWlrKoEGDYsvbF9lx5NZ12OKty/fc7ovjsMXmvCz2ZW5Ds81sibsXp3tMu1xERALRrHe5iEj2q+uM73QnLemEpaZpdlvo+2sXkGQnrS8i/6NZFXqrVq3YvHmz3qSSEXdn8+bNtGrVan8PRaRZaFa7XAoKCli/fj2ffPJJjek7duxI7E2bVHa25SaZnWTuYYcdRkFBQezZItmoWRV6y5Yt037zTGlp6R6HycUlqexsy00yO9tyRbJVs9rlIiIijZdRoZvZUDN728xWm9nENI9/18wWmNkbZrbMzM6Of6giIrI39Ra6meUA9wBnAd2AkWbWrdZs1wAz3b03cAHw73EPVERE9i6TLfS+wGp3
X+Pu3wAzgPNqzePAodHtPOCj+IYoIiKZqPfUfzMbAQx19zHR/YuAfu4+rto8xwDzgfZAW+CH7r4kTdZYYCxAfn5+n90Xl6rP1q1byc3NzWjehkoqO9tyk8xWbvLZceQur/hyj2n5rWHjP/act0eHvCbl1pXdkNy6NOdlHEf24MGD6zz1P66jXEYCD7v7rWZ2MvCImXV3913VZ3L3+4H7IXUtl0yvXdBcrqEQcm6S2cpNPjuO3HRfNVfntVxGZf5a6XLrym5Ibl2a8zJOOjuTXS4VQMdq9wuiadX9HJgJ4O6vAq2AI5s8OhERyVgmhb4ION7MupjZwaQ+9JxTa54PgCEAZnYiqUL/BBER2WfqLXR33wmMA+YBq0gdzbLCzCab2bBotgnApWZWDjwGlLjO3xcR2acy2ofu7nOBubWmXVvt9kpgQO3niYjIvqMzRUVEAqFCFxEJhApdRCQQKnQRkUCo0EVEAqFCFxEJhApdRCQQKnQRkUCo0EVEAqFCFxEJhApdRCQQKnQRkUCo0EVEAqFCFxEJhApdRCQQKnQRkUCo0EVEAqFCFxEJREZfQSd16zzxmbTTJ/TYSUmtx9bdeM6+GJKIHKC0hS4iEggVuohIIFToIiKBUKGLiARChS4iEggVuohIIFToIiKB0HHozVi6Y9zTHd8OOsZdRLSFLiISDBW6iEggVOgiIoFQoYuIBEKFLiISCBW6iEggdNjiASipS/7qUsIi+5cKXSQB+uUm+4N2uYiIBEKFLiISCBW6iEggMip0MxtqZm+b2Wozm1jHPP/LzFaa2Qoz+3O8wxQRkfrU+6GomeUA9wCnA+uBRWY2x91XVpvneODXwAB3/9zMvpPUgEVEJL1MttD7AqvdfY27fwPMAM6rNc+lwD3u/jmAu2+Kd5giIlIfc/e9z2A2Ahjq7mOi+xcB/dx9XLV5ZgPvAAOAHOA6d38uTdZYYCxAfn5+nxkzZmQ0yK1bt5Kbm5vRvA3V1OzlFV+mnZ7fGjb+o+a0Hh3ympydLreh2UmNOcllkU5S60Ucudm4LLJtfatLc14v4sgePHjwEncvTvdYXMehtwCOBwYBBcBLZtbD3b+oPpO73w/cD1BcXOyDBg3KKLy0tJRM522opmanuzY5pI43vnV5zcW7blTDXidddrrchmYnNeYkl0U6Sa0XceRm47LItvWtLs15vUg6O5NdLhVAx2r3C6Jp1a0H5rj7t+6+ltTW+vFNHp2IiGQsk0JfBBxvZl3M7GDgAmBOrXlmk9o6x8yOBP4ZWBPfMEVEpD71Frq77wTGAfOAVcBMd19hZpPNbFg02zxgs5mtBBYAV7v75qQGLSIie8poH7q7zwXm1pp2bbXbDlwV/YiIyH6gM0VFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCURGhW5mQ83sbTNbbWYT9zLf+WbmZlYc3xBFRCQT9Ra6meUA9wBnAd2AkWbWLc187YArgNfjHqSIiNQvky30vsBqd1/j7t8AM4Dz0sz3W+AmYEeM4xMRkQxlUugdgA+r3V8fTatiZkVAR3d/JsaxiYhIA5i7730GsxHAUHcfE92/COjn7uOi+wcBfwNK3H2dmZUCv3L3xWmyxgJjAfLz8/vMmDEjo0Fu3bqV3NzcjP+nGqKp2csrvkw7Pb81bPxHzWk9OuQ1OTtdbkOzkxpzkssinaTWizhys3FZZNv6VpfmvF7EkT148OAl7p72c8pMCv1k4Dp3PzO6/2sAd78hup8HvAdsjZ5yNPAZMCxdqe9W
XFzsixfX+XANpaWlDBo0KKN5G6qp2Z0npv+jZEKPndy6vEWNaetuPKfJ2elyG5qd1JiTXBbpJLVexJGbjcsi29a3ujTn9SKObDOrs9Az2eWyCDjezLqY2cHABcCc3Q+6+5fufqS7d3b3zsBr1FPmIiISv3oL3d13AuOAecAqYKa7rzCzyWY2LOkBiohIZvb8WyoNd58LzK017do65h3U9GGJiEhD6UxREZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCYQKXUQkECp0EZFAqNBFRAKhQhcRCURG3ykqItLcdJ74TNrpE3rspCTNY+tuPCfpIe13KnQRkVrS/bLIhl8U2uUiIhIIFbqISCBU6CIigVChi4gEQoUuIhIIFbqISCBU6CIigVChi4gEQoUuIhIIFbqISCBU6CIigVChi4gEQoUuIhIIFbqISCBU6CIigVChi4gEQoUuIhIIFbqISCAyKnQzG2pmb5vZajObmObxq8xspZktM7MXzKxT/EMVEZG9qbfQzSwHuAc4C+gGjDSzbrVmewModveewCzg5rgHKiIie5fJFnpfYLW7r3H3b4AZwHnVZ3D3Be6+Pbr7GlAQ7zBFRKQ+5u57n8FsBDDU3cdE9y8C+rn7uDrmvxv42N2npHlsLDAWID8/v8+MGTMyGuTWrVvJzc3NaN6Gamr28oov007Pbw0b/1FzWo8OeU3OTpfb0OykxpzkskgnqfUijtxsXBYhr29xZMeRW5eG/PsNHjx4ibsXp3usRZNHUo2ZXQgUA6ele9zd7wfuByguLvZBgwZllFtaWkqm8zZUU7NLJj6TdvqEHju5dXnNxbtuVMNeJ112utyGZic15iSXRTpJrRdx5Gbjsgh5fYsjO47cusS1LmdS6BVAx2r3C6JpNZjZD4FJwGnu/nWTRyYiIg2SyT70RcDxZtbFzA4GLgDmVJ/BzHoD9wHD3H1T/MMUEZH61Fvo7r4TGAfMA1YBM919hZlNNrNh0Wy3ALnAE2ZWZmZz6ogTEZGEZLQP3d3nAnNrTbu22u0fxjwuERFpoFg/FG3OOu/lA5TaH4Csu/GcfTEkEZFYNbtCT1e86UoXVLwiItXpWi4iIoFodlvoIunoLzeR+mkLXUQkENpClwOatvwlJNpCFxEJhApdRCQQKnQRkUCo0EVEAqFCFxEJhI5yERHZR5K+BIm20EVEAqFCFxEJhApdRCQQKnQRkUCo0EVEAqFCFxEJhApdRCQQKnQRkUCo0EVEAqFCFxEJhApdRCQQKnQRkUCo0EVEAqFCFxEJhApdRCQQKnQRkUCo0EVEAqFCFxEJhApdRCQQKnQRkUCo0EVEAqFCFxEJhApdRCQQKnQRkUCo0EVEAqFCFxEJREaFbmZDzextM1ttZhPTPH6ImT0ePf66mXWOfaQiIrJX9Ra6meUA9wBnAd2AkWbWrdZsPwc+d/d/Am4Hbop7oCIisneZbKH3BVa7+xp3/waYAZxXa57zgOnR7VnAEDOz+IYpIiL1MXff+wxmI4Ch7j4mun8R0M/dx1Wb581onvXR/feieT6tlTUWGBvdPQF4O8NxHgl8Wu9cjZNUdrblJpmt3OSzsy03yexsy21odid3PyrdAy3iG0/93P1+4P6GPs/MFrt7cQJDSiw723KTzFZu8tnZlptkdrblxpmdyS6XCqBjtfsF0bS085hZCyAP2NzUwYmISOYyKfRFwPFm1sXMDgYuAObUmmcOMDq6PQL4m9e3L0dERGJV7y4Xd99pZuOAeUAO8Ad3X2Fmk4HF7j4HeBB4xMxWA5+RKv04NXg3TTPIzrbcJLOVm3x2tuUmmZ1tubFl1/uhqIiIZAedKSoiEggVuohIIJp1oZvZH8xsU3Sce5y5Hc1sgZmtNLMVZnZFTLmtzOy/zaw8yr0+jtxar5FjZm+Y2dMxZq4zs+VmVmZmi2PM
PczMZpnZW2a2ysxOjin3hGisu3++MrN/iyn7yujf7k0ze8zMWsWUe0WUuaKpY033vjCzw83sr2b2bvTf9jHl/jQa8y4za/RhdXVk3xKtG8vM7CkzOyym3N9GmWVmNt/Mjo0jt9pjE8zMzezImMZ7nZlVVFufz25obhV3b7Y/wKlAEfBmzLnHAEXR7XbAO0C3GHINyI1utwReB/rHPPargD8DT8eYuQ44MoF/v+nAmOj2wcBhCbxGDvAxqZMtmprVAVgLtI7uzwRKYsjtDrwJtCF1IMLzwD81IW+P9wVwMzAxuj0RuCmm3BNJnQRYChTHPOYzgBbR7ZtiHPOh1W6PB+6NIzea3pHUASLvN+Y9U8d4rwN+1dT1zN2b9xa6u79E6qiZuHM3uPvS6PYWYBWpN3NTc93dt0Z3W0Y/sX3qbGYFwDnAf8SVmRQzyyO18j4I4O7fuPsXCbzUEOA9d38/prwWQOvofIo2wEcxZJ4IvO7u2919J/Ai8JPGhtXxvqh++Y3pwI/jyHX3Ve6e6RndDc2eHy0PgNdIneMSR+5X1e62pRHvwb10z+3A/21MZj25sWjWhb4vRFeG7E1qazqOvBwzKwM2AX9191hyI1NJrUy7YsyE1Mo538yWRJdniEMX4BPgoWgX0X+YWduYsqu7AHgsjiB3rwB+D3wAbAC+dPf5MUS/CQw0syPMrA1wNjVP1otDvrtviG5/DOTHnJ+0/w08G1eYmf3OzD4ERgHXxpR5HlDh7uVx5NUyLtpN9IfG7C7b7YAudDPLBf4T+Ldav9Ubzd0r3b2Q1NZGXzPrHkeumZ0LbHL3JXHk1fIv7l5E6oqa/8fMTo0hswWpPy2nuXtvYBupXQGxiU50GwY8EVNee1Jbul2AY4G2ZnZhU3PdfRWpXQrzgeeAMqCyqbl7eT0nxr8Mk2Zmk4CdwKNxZbr7JHfvGGWOq2/++kS/iP8fMf1yqGUacBxQSGpD4tbGBh2whW5mLUmV+aPu/mTc+dHuhQXA0JgiBwDDzGwdqSte/sDM/hRHcLRlirtvAp4idYXNploPrK/2F8osUgUfp7OApe6+Maa8HwJr3f0Td/8WeBI4JY5gd3/Q3fu4+6nA56Q+t4nTRjM7BiD676aY8xNhZiXAucCo6BdR3B4Fzo8h5zhSv+jLo/dgAbDUzI5uarC7b4w2BHcBD9CE998BWehmZqT27a5y99tizD1q9yf1ZtYaOB14K45sd/+1uxe4e2dSuxn+5u5N3no0s7Zm1m73bVIfVDX5qCJ3/xj40MxOiCYNAVY2NbeWkcS0uyXyAdDfzNpE68gQUp+vNJmZfSf673dJ7T//cxy51VS//MZo4L9izo+dmQ0ltQtxmLtvjzH3+Gp3zyOG96C7L3f377h75+g9uJ7UgRUfNzV79y/iyHCa8v6L45PVpH5IvVk3AN+SWoA/jyn3X0j9SbqM1J+/ZcDZMeT2BN6Ict8Erk1ouQwipqNcgK5AefSzApgU4zgLgcXR8pgNtI8xuy2pC8DlxbxsrydVAG8CjwCHxJT7MqlfaOXAkCZm7fG+AI4AXgDeJXUUzeEx5Q6Pbn8NbATmxTjm1cCH1d6DjTkaJV3uf0b/fsuAvwAd4sit9fg6GneUS7rxPgIsj8Y7BzimseuGTv0XEQnEAbnLRUQkRCp0EZFAqNBFRAKhQhcRCYQKXUQkECp0CY6ZjY+u7tigMw/NrLOZ/SypcYkkTYUuIfolcLq7j2rg8zoDDS50M8tp6HNEkqBCl6CY2b2kTpZ61swmRRc7+u/oAmHnRfN0NrOXzWxp9LP79P4bSV1Eqyy6LnqJmd1dLftpMxsU3d5qZreaWTlwspldGL1OmZndp5KX/UGFLkFx98tIXfJ2MKmzSf/m7n2j+7dElzfYRGoLvgj4V+DO6OkTgZfdvdDdb6/npdqSuiRuL1JnrP4rMMBTF2arJHWVP5F9qsX+HoBIgs4gdUGzX0X3WwHfJVX4d5tZIany/edGZFeSOsUc
Utd86QMsSl0ChtZkycWxJCwqdAmZAed7rS9oMLPrSF2XpBepv1J31PH8ndT8K7b619HtcPfdl8A1YLq7/zqOQYs0lna5SMjmAZdHV07EzHpH0/OADZ66XOlFpL7GDmALqa8k3G0dUGhmB5lZR+q+rOkLwIhqV1Q83Mw6xfp/IpIBFbqE7LekvgZwmZmtiO4D/DswOvpA83ukvnwDUle7q7TUl3xfCbxC6jtGV5Laz7403Yu4+0rgGlLf+rQM+Cup760V2ad0tUURkUBoC11EJBAqdBGRQKjQRUQCoUIXEQmECl1EJBAqdBGRQKjQRUQC8f8BgwnM14Siz+4AAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Share of rows in which each feature has a non-zero score.\n", "# The mean of the boolean mask divides by the actual row count, so nothing is hard-coded to one particular file.\n", "nonzero_rate = (feature > 0).mean(axis=0)\n", "df = pd.DataFrame({'feature': list(range(1, len(nonzero_rate) + 1)), 'rate of nonzero score': nonzero_rate.values})\n", "ax = df.plot.bar(x='feature', y='rate of nonzero score', rot=0, grid=True)\n", "ax.set(ylabel='rate of nonzero score', title='Share of rows with a non-zero score, per feature');" ] }, { "cell_type": "code", "execution_count": 108, "id": "quick-watson", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
123456789101112131415
021.59961020.68296616.61670016.7380700.0000000.0000000.0000000.00000023.0662380.018.57168616.56506714.09811014.8341250.000000
10.0000000.00000017.82968117.9550600.0000000.0000000.0000000.00000023.2442550.020.54407519.7959020.0000000.0000000.000000
20.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.00.0000000.0000000.0000000.00000013.733603
30.0000000.00000013.58403013.6913939.7828379.7828370.0000000.00000021.3102150.017.36810117.4810640.0000000.00000014.262909
40.0000000.00000014.19413014.26631410.03650910.03650914.34054214.65528723.0263000.012.06278012.2862950.0000000.0000000.000000
................................................
87780.0000000.0000000.0000000.0000005.4721885.4721880.0000000.0000000.0000000.010.71478710.6219510.0000000.00000011.901220
87790.0000000.0000008.8625788.5129569.2095099.2095098.9340179.08581913.4066170.011.30706511.2444680.0000000.0000000.000000
87800.0000000.0000000.0000000.0000007.2522797.2522790.0000000.0000000.0000000.010.1667059.7031000.0000000.0000000.000000
878111.45362110.80798212.64771512.3279560.0000000.0000000.0000000.00000014.8472950.011.63413911.38257513.58971213.6593420.000000
87820.0000000.0000000.0000000.0000008.4493778.4493770.0000000.0000000.0000000.010.39974210.2227920.0000000.0000000.000000
\n", "

8783 rows × 15 columns

\n", "
" ], "text/plain": [ " 1 2 3 4 5 6 \\\n", "0 21.599610 20.682966 16.616700 16.738070 0.000000 0.000000 \n", "1 0.000000 0.000000 17.829681 17.955060 0.000000 0.000000 \n", "2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "3 0.000000 0.000000 13.584030 13.691393 9.782837 9.782837 \n", "4 0.000000 0.000000 14.194130 14.266314 10.036509 10.036509 \n", "... ... ... ... ... ... ... \n", "8778 0.000000 0.000000 0.000000 0.000000 5.472188 5.472188 \n", "8779 0.000000 0.000000 8.862578 8.512956 9.209509 9.209509 \n", "8780 0.000000 0.000000 0.000000 0.000000 7.252279 7.252279 \n", "8781 11.453621 10.807982 12.647715 12.327956 0.000000 0.000000 \n", "8782 0.000000 0.000000 0.000000 0.000000 8.449377 8.449377 \n", "\n", " 7 8 9 10 11 12 13 \\\n", "0 0.000000 0.000000 23.066238 0.0 18.571686 16.565067 14.098110 \n", "1 0.000000 0.000000 23.244255 0.0 20.544075 19.795902 0.000000 \n", "2 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 \n", "3 0.000000 0.000000 21.310215 0.0 17.368101 17.481064 0.000000 \n", "4 14.340542 14.655287 23.026300 0.0 12.062780 12.286295 0.000000 \n", "... ... ... ... ... ... ... ... \n", "8778 0.000000 0.000000 0.000000 0.0 10.714787 10.621951 0.000000 \n", "8779 8.934017 9.085819 13.406617 0.0 11.307065 11.244468 0.000000 \n", "8780 0.000000 0.000000 0.000000 0.0 10.166705 9.703100 0.000000 \n", "8781 0.000000 0.000000 14.847295 0.0 11.634139 11.382575 13.589712 \n", "8782 0.000000 0.000000 0.000000 0.0 10.399742 10.222792 0.000000 \n", "\n", " 14 15 \n", "0 14.834125 0.000000 \n", "1 0.000000 0.000000 \n", "2 0.000000 13.733603 \n", "3 0.000000 14.262909 \n", "4 0.000000 0.000000 \n", "... ... ... 
\n", "8778 0.000000 11.901220 \n", "8779 0.000000 0.000000 \n", "8780 0.000000 0.000000 \n", "8781 13.659342 0.000000 \n", "8782 0.000000 0.000000 \n", "\n", "[8783 rows x 15 columns]" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature" ] }, { "cell_type": "code", "execution_count": 118, "id": "verbal-cleaning", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
123456789101112131415
021.5996120.68296616.61670016.7380700.0000000.0000000.0000000.00000023.0662380.018.57168616.56506714.0981114.8341250.000000
10.000000.00000017.82968117.9550600.0000000.0000000.0000000.00000023.2442550.020.54407519.7959020.000000.0000000.000000
20.000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.00.0000000.0000000.000000.00000013.733603
30.000000.00000013.58403013.6913939.7828379.7828370.0000000.00000021.3102150.017.36810117.4810640.000000.00000014.262909
40.000000.00000014.19413014.26631410.03650910.03650914.34054214.65528723.0263000.012.06278012.2862950.000000.0000000.000000
................................................
87770.000000.0000000.0000000.0000009.2357439.2357430.0000000.0000000.0000000.010.92448010.9955900.000000.00000011.798944
87780.000000.0000000.0000000.0000005.4721885.4721880.0000000.0000000.0000000.010.71478710.6219510.000000.00000011.901220
87790.000000.0000008.8625788.5129569.2095099.2095098.9340179.08581913.4066170.011.30706511.2444680.000000.0000000.000000
87800.000000.0000000.0000000.0000007.2522797.2522790.0000000.0000000.0000000.010.1667059.7031000.000000.0000000.000000
87820.000000.0000000.0000000.0000008.4493778.4493770.0000000.0000000.0000000.010.39974210.2227920.000000.0000000.000000
\n", "

7459 rows × 15 columns

\n", "
" ], "text/plain": [ " 1 2 3 4 5 6 \\\n", "0 21.59961 20.682966 16.616700 16.738070 0.000000 0.000000 \n", "1 0.00000 0.000000 17.829681 17.955060 0.000000 0.000000 \n", "2 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "3 0.00000 0.000000 13.584030 13.691393 9.782837 9.782837 \n", "4 0.00000 0.000000 14.194130 14.266314 10.036509 10.036509 \n", "... ... ... ... ... ... ... \n", "8777 0.00000 0.000000 0.000000 0.000000 9.235743 9.235743 \n", "8778 0.00000 0.000000 0.000000 0.000000 5.472188 5.472188 \n", "8779 0.00000 0.000000 8.862578 8.512956 9.209509 9.209509 \n", "8780 0.00000 0.000000 0.000000 0.000000 7.252279 7.252279 \n", "8782 0.00000 0.000000 0.000000 0.000000 8.449377 8.449377 \n", "\n", " 7 8 9 10 11 12 13 \\\n", "0 0.000000 0.000000 23.066238 0.0 18.571686 16.565067 14.09811 \n", "1 0.000000 0.000000 23.244255 0.0 20.544075 19.795902 0.00000 \n", "2 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.00000 \n", "3 0.000000 0.000000 21.310215 0.0 17.368101 17.481064 0.00000 \n", "4 14.340542 14.655287 23.026300 0.0 12.062780 12.286295 0.00000 \n", "... ... ... ... ... ... ... ... \n", "8777 0.000000 0.000000 0.000000 0.0 10.924480 10.995590 0.00000 \n", "8778 0.000000 0.000000 0.000000 0.0 10.714787 10.621951 0.00000 \n", "8779 8.934017 9.085819 13.406617 0.0 11.307065 11.244468 0.00000 \n", "8780 0.000000 0.000000 0.000000 0.0 10.166705 9.703100 0.00000 \n", "8782 0.000000 0.000000 0.000000 0.0 10.399742 10.222792 0.00000 \n", "\n", " 14 15 \n", "0 14.834125 0.000000 \n", "1 0.000000 0.000000 \n", "2 0.000000 13.733603 \n", "3 0.000000 14.262909 \n", "4 0.000000 0.000000 \n", "... ... ... 
\n", "8777 0.000000 11.798944 \n", "8778 0.000000 11.901220 \n", "8779 0.000000 0.000000 \n", "8780 0.000000 0.000000 \n", "8782 0.000000 0.000000 \n", "\n", "[7459 rows x 15 columns]" ] }, "execution_count": 118, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature.iloc[rating[rating != 0].index]" ] },
{
 "cell_type": "code",
 "execution_count": 119,
 "id": "hidden-nitrogen",
 "metadata": {},
 "outputs": [
  { "data": { "text/plain": [ "0 1\n", "1 1\n", "2 1\n", "3 1\n", "4 -1\n", " ..\n", "8777 -1\n", "8778 1\n", "8779 -1\n", "8780 1\n", "8782 -1\n", "Name: 0, Length: 7459, dtype: int64" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" }
 ],
 "source": [
  "# keep only the non-zero ratings\n",
  "rating[rating != 0]"
 ]
},
{
 "cell_type": "code",
 "execution_count": 115,
 "id": "medical-athens",
 "metadata": {},
 "outputs": [
  { "data": { "text/plain": [ "Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,\n", " ...\n", " 8772, 8773, 8774, 8775, 8776, 8777, 8778, 8779, 8780, 8782],\n", " dtype='int64', length=7459)" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" }
 ],
 "source": [
  "# index labels of the rows with a non-zero rating\n",
  "rating[rating != 0].index"
 ]
},
{
 "cell_type": "code",
 "execution_count": 72,
 "id": "contemporary-pasta",
 "metadata": {},
 "outputs": [
  { "data": { "text/plain": [ "0.42344681012562146" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" }
 ],
 "source": [
  "# fraction of feature entries that are strictly positive\n",
  "(feature > 0).sum(axis=0).sum() / (feature.shape[0]*feature.shape[1])"
 ]
},
{
 "cell_type": "code",
 "execution_count": 19,
 "id": "received-running",
 "metadata": {},
 "outputs": [
  { "name": "stdout", "output_type": "stream", "text": [ "3907 1324 3552\n" ] }
 ],
 "source": [
  "# number of rows with rating 1, 0 and -1\n",
  "print((rating == 1).sum(), (rating == 0).sum(), (rating == -1).sum())"
 ]
},
{
 "cell_type": "code",
 "execution_count": 123,
 "id": "supported-hardware",
 "metadata": {},
 "outputs": [
  { "data": { "text/plain": [ "0.15074575885232835" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" }
 ],
 "source": [
  "# share of rows with rating 0, computed from the data instead of\n",
  "# hard-coding the counts printed above (1324 of 8783)\n",
  "(rating == 0).mean()"
] },
{
 "cell_type": "code",
 "execution_count": 20,
 "id": "abroad-tiger",
 "metadata": {},
 "outputs": [
  { "name": "stdout", "output_type": "stream", "text": [ "3907 4876\n" ] }
 ],
 "source": [
  "# binary target: 1 for a positive rating, 0 for everything else\n",
  "numbinrating = rating.apply(lambda x: 1 if x > 0 else 0)\n",
  "print((numbinrating == 1).sum(), (numbinrating == 0).sum())"
 ]
},
{
 "cell_type": "code",
 "execution_count": 120,
 "id": "leading-backing",
 "metadata": {},
 "outputs": [
  { "data": { "text/plain": [ "0 1\n", "1 1\n", "2 1\n", "3 1\n", "4 0\n", " ..\n", "8777 0\n", "8778 1\n", "8779 0\n", "8780 1\n", "8782 0\n", "Name: 0, Length: 7459, dtype: int64" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" }
 ],
 "source": [
  "# binary target restricted to the rows with a non-zero rating\n",
  "numbinrating.loc[rating != 0]"
 ]
},
{
 "cell_type": "code",
 "execution_count": 56,
 "id": "careful-louis",
 "metadata": {},
 "outputs": [],
 "source": [
  "from sklearn.linear_model import LogisticRegression\n",
  "from sklearn.model_selection import train_test_split\n",
  "from sklearn import metrics, preprocessing\n",
  "import math"
 ]
},
{
 "cell_type": "code",
 "execution_count": 121,
 "id": "refined-ethiopia",
 "metadata": {},
 "outputs": [],
 "source": [
  "def evaluate_logit(X, y, n_splits=10, test_size=0.2):\n",
  "    \"\"\"Fit a logistic regression on several random train/test splits.\n",
  "\n",
  "    The features are used as given (no standardization). Split number i is\n",
  "    drawn with random_state=i.\n",
  "\n",
  "    Returns six lists with one entry per split, in this order: balanced\n",
  "    accuracy, average precision, Brier score loss, F1 score, coefficient\n",
  "    vector and intercept of the fitted model.\n",
  "    \"\"\"\n",
  "    balanced_accuracy, avg_precision, brier, f1 = [], [], [], []\n",
  "    coefs, intercepts = [], []\n",
  "    for rs in range(n_splits):\n",
  "        X_train, X_test, y_train, y_test = train_test_split(\n",
  "            X, y, test_size=test_size, random_state=rs)\n",
  "        logit = LogisticRegression(fit_intercept=True, solver='liblinear')\n",
  "        logit.fit(X_train, y_train)\n",
  "        y_pred = logit.predict(X_test)\n",
  "        # second column of predict_proba, i.e. the probability of class 1\n",
  "        y_prob = logit.predict_proba(X_test)[:, 1]\n",
  "        balanced_accuracy.append(metrics.balanced_accuracy_score(y_test, y_pred))\n",
  "        avg_precision.append(metrics.average_precision_score(y_test, y_prob))\n",
  "        brier.append(metrics.brier_score_loss(y_test, y_prob))  # the smaller, the better\n",
  "        # F1 is computed from the hard predictions with metrics.f1_score;\n",
  "        # average_precision_score does not return an F1 value\n",
  "        f1.append(metrics.f1_score(y_test, y_pred))\n",
  "        coefs.append(logit.coef_[0])\n",
  "        intercepts.append(logit.intercept_[0])\n",
  "    return balanced_accuracy, avg_precision, brier, f1, coefs, intercepts\n",
  "\n",
  "\n",
  "# rows with a non-zero rating only\n",
  "rated = rating != 0  # boolean mask, applied by label with .loc\n",
  "balanced_accuracy, avg_precision_score, brier_loss, f1_score, coefs, intercepts = evaluate_logit(\n",
  "    feature.loc[rated], numbinrating.loc[rated])"
 ]
},
{
 "cell_type": "code",
 "execution_count": 122,
 "id": "friendly-gnome",
 "metadata": {},
 "outputs": [
  { "name": "stdout", "output_type": "stream", "text": [ "balanced accuracy: 0.6624\n", "average precision score: 0.7551\n", "brier score loss: 0.2073\n", "f1 score: 0.6293\n" ] }
 ],
 "source": [
  "# no standardization\n",
  "print('balanced accuracy: {:.4f}'.format(np.mean(balanced_accuracy)))\n",
  "print('average precision score: {:.4f}'.format(np.mean(avg_precision_score)))\n",
  "print('brier score loss: {:.4f}'.format(np.mean(brier_loss))) # The smaller, the better\n",
  "# TODO(review): re-run this cell; the stored 'f1 score' output was presumably\n",
  "# produced with average_precision_score on the hard predictions, not with f1_score\n",
  "print('f1 score: {:.4f}'.format(np.mean(f1_score)))"
 ]
},
{
 "cell_type": "code",
 "execution_count": 131,
 "id": "printable-singer",
 "metadata": {},
 "outputs": [],
 "source": [
  "# all rows; rows with rating 0 belong to class 0 of numbinrating\n",
  "# (evaluate_logit is defined in the previous evaluation cell)\n",
  "balanced_accuracy, avg_precision_score, brier_loss, f1_score, coefs, intercepts = evaluate_logit(\n",
  "    feature, numbinrating)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 103,
 "id": "thorough-shopper",
 "metadata": {},
 "outputs": [
  { "name": "stdout", "output_type": "stream", "text": [ "balanced accuracy: 0.6306\n", "average precision score: 0.6506\n", "brier score loss: 0.2153\n", "f1 score: 0.5318\n" ] }
 ],
 "source": [
  "# no standardization\n",
  "print('balanced accuracy: {:.4f}'.format(np.mean(balanced_accuracy)))\n",
  "print('average precision score: {:.4f}'.format(np.mean(avg_precision_score)))\n",
  "print('brier score loss: {:.4f}'.format(np.mean(brier_loss))) # The smaller, the better\n",
  "# TODO(review): re-run this cell; the stored 'f1 score' output was presumably\n",
  "# produced with average_precision_score on the hard predictions, not with f1_score\n",
  "print('f1 score: {:.4f}'.format(np.mean(f1_score)))"
 ]
},
{
 "cell_type": "markdown",
 "id": "literary-conversation",
 "metadata": {},
 "source": [
  "**Best results so far**\n",
  "\n",
  "- balanced accuracy: 0.6342\n",
  "- average precision score: 0.6821\n",
  "- brier score loss: 0.2137\n",
  "- f1 score: 0.5533 (TODO: recompute; this value was presumably obtained with `average_precision_score` on the hard predictions)"
 ]
},
{
 "cell_type": "code",
 "execution_count":
124, "id": "joined-destination", "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ],
 "source": [
  "# no standardization\n",
  "# bar chart of the coefficients averaged over the splits, one bar per feature 1..15\n",
  "df = pd.DataFrame({\n",
  "    'feature': list(range(1, 16)),\n",
  "    'coefficients': np.mean(coefs, axis=0),\n",
  "})\n",
  "ax = df.plot.bar(x='feature', y='coefficients', rot=0, grid=True)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 125,
 "id": "broken-taylor",
 "metadata": {},
 "outputs": [
  { "data": { "text/plain": [ "array([-0.02159655, 0.04977869, 0.05276279, 0.04668993, 0.02615154,\n", " 0.02615154, 0.00636762, 0.01282327, -0.02016103, 0. ,\n", " -0.02192281, 0.02003289, -0.04702737, 0.03902806, 0.10792615])" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" }
 ],
 "source": [
  "# mean coefficient per feature across the splits\n",
  "np.mean(coefs, axis=0)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 126,
 "id": "blond-middle",
 "metadata": {},
 "outputs": [
  { "data": { "text/plain": [ "array([0.02422864, 0.0258807 , 0.00526667, 0.00627771, 0.00087469,\n", " 0.00087469, 0.01432958, 0.01491875, 0.00288218, 0. ,\n", " 0.0042263 , 0.00431502, 0.00625123, 0.00565562, 0.00302476])" ] }, "execution_count": 126, "metadata": {}, "output_type": "execute_result" }
 ],
 "source": [
  "# standard deviation of each coefficient across the splits\n",
  "np.std(coefs, axis=0)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 130,
 "id": "stylish-prague",
 "metadata": {},
 "outputs": [
  { "data": { "text/plain": [ "-1.172818154019393" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" }
 ],
 "source": [
  "# mean intercept across the splits\n",
  "np.asarray(intercepts).mean()"
 ]
},
{ "cell_type": "code", "execution_count": 100, "id": "anticipated-bibliography", "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ],
 "source": [
  "# no standardization\n",
  "# bar chart of the coefficients averaged over the splits, one bar per feature 1..15\n",
  "df = pd.DataFrame({\n",
  "    'feature': list(range(1, 16)),\n",
  "    'coefficients': np.mean(coefs, axis=0),\n",
  "})\n",
  "ax = df.plot.bar(x='feature', y='coefficients', rot=0, grid=True)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 99,
 "id": "filled-intervention",
 "metadata": {},
 "outputs": [
  { "data": { "text/plain": [ "array([-0.00722761, 0.021936 , 0.04056063, 0.03559395, 0.02052407,\n", " 0.02052407, 0.01702212, 0.00057635, -0.01129512, 0. ,\n", " -0.01127191, 0.0138696 , -0.03131791, 0.02770065, 0.09263194])" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" }
 ],
 "source": [
  "# mean coefficient per feature across the splits\n",
  "np.mean(coefs, axis=0)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 97,
 "id": "international-reward",
 "metadata": {},
 "outputs": [
  { "data": { "text/plain": [ "array([0.00777849, 0.00867834, 0.00309854, 0.00398819, 0.00066269,\n", " 0.00066269, 0.01262642, 0.01286337, 0.00246544, 0. ,\n", " 0.00341984, 0.00401186, 0.00823876, 0.00863262, 0.00237907])" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" }
 ],
 "source": [
  "# standard deviation of each coefficient across the splits\n",
  "np.std(coefs, axis=0)"
 ]
},
{
 "cell_type": "code",
 "execution_count": 132,
 "id": "precious-reader",
 "metadata": {},
 "outputs": [
  { "data": { "text/plain": [ "-1.3638519776835112" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" }
 ],
 "source": [
  "# mean intercept across the splits\n",
  "np.asarray(intercepts).mean()"
 ]
},
{ "cell_type": "code", "execution_count": null, "id": "isolated-implementation", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "venv" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.7" } }, "nbformat": 4, "nbformat_minor": 5 }