Python數據分析招式:pandas庫提取清洗排序-1
發布時間:2021-11-23 點擊數:611
要點:
數據的基本處理
數據的提取
數據的初步清洗
數據的排序
泰坦尼克數據集下載地址:
地址1(需要注冊): https://www.kaggle.com/c/titanic/data
地址2(百度網盤): https://pan.baidu.com/s/1Vp0QmVLu43_Hb9jHR2FKXg
密碼: rdfr
導入數據
# -*- coding: utf-8 -*-
# @File : 泰坦尼克數據分析.py  (Titanic data analysis)
# @Date : 2018-06-03
import numpy as np
import pandas as pd

# Location of the Titanic training CSV, relative to the working directory.
file = "data/train.csv"

# read_csv already returns a DataFrame, so it is used directly instead of
# being passed through an extra pd.DataFrame(...) constructor call.
df = pd.read_csv(file)
1、數據的基本處理
# Shape of the frame as (rows, columns).
print(df.shape)  # (891, 12)

# First 3 rows.
print(df.head(3))
# Output recorded in the original article:
#    PassengerId  Survived  Pclass  ...     Fare  Cabin  Embarked
# 0            1         0       3  ...   7.2500    NaN         S
# 1            2         1       1  ...  71.2833    C85         C
# 2            3         1       3  ...   7.9250    NaN         S
#
# [3 rows x 12 columns]

# Last 3 rows.
print(df.tail(3))
# Output recorded in the original article:
#      PassengerId  Survived  Pclass  ...   Fare  Cabin  Embarked
# 888          889         0       3  ...  23.45    NaN         S
# 889          890         1       1  ...  30.00   C148         C
# 890          891         0       3  ...   7.75    NaN         Q
#
# [3 rows x 12 columns]

# Column summary. DataFrame.info() prints its report itself and returns
# None, so it is called bare: wrapping it in print() appended a stray
# "None" line after the report.
df.info()
# Output recorded in the original article (without the stray "None"):
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 891 entries, 0 to 890
# Data columns (total 12 columns):
# PassengerId    891 non-null int64
# Survived       891 non-null int64
# Pclass         891 non-null int64
# Name           891 non-null object
# Sex            891 non-null object
# Age            714 non-null float64
# SibSp          891 non-null int64
# Parch          891 non-null int64
# Ticket         891 non-null object
# Fare           891 non-null float64
# Cabin          204 non-null object
# Embarked       889 non-null object
# dtypes: float64(2), int64(5), object(5)
# memory usage: 83.6+ KB

# Descriptive statistics.
print(df.describe())
# Output recorded in the original article:
#        PassengerId    Survived  ...       Parch        Fare
# count   891.000000  891.000000  ...  891.000000  891.000000
# mean    446.000000    0.383838  ...    0.381594   32.204208
# std     257.353842    0.486592  ...    0.806057   49.693429
# min       1.000000    0.000000  ...    0.000000    0.000000
# 25%     223.500000    0.000000  ...    0.000000    7.910400
# 50%     446.000000    0.000000  ...    0.000000   14.454200
# 75%     668.500000    1.000000  ...    0.000000   31.000000
# max     891.000000    1.000000  ...    6.000000  512.329200
#
# [8 rows x 7 columns]

# Number of missing values in each column.
print(df.isnull().sum())
# Output recorded in the original article:
# PassengerId      0
# Survived         0
# Pclass           0
# Name             0
# Sex              0
# Age            177
# SibSp            0
# Parch            0
# Ticket           0
# Fare             0
# Cabin          687
# Embarked         2
# dtype: int64

# Distinct values of the Pclass column.
print(df["Pclass"].unique())  # [3 1 2]
2、數據的提取
# Fetch a single row by its index label.
row_630 = df.loc[630]
print(row_630)
# Output recorded in the original article:
# PassengerId                                     631
# Survived                                          1
# Pclass                                            1
# Name           Barkworth, Mr. Algernon Henry Wilson
# Sex                                            male
# Age                                              80
# SibSp                                             0
# Parch                                             0
# Ticket                                        27042
# Fare                                             30
# Cabin                                           A23
# Embarked                                          S
# Name: 630, dtype: object

# Positional slice: rows at positions 2, 3 and 4, restricted to the first
# 5 columns.
rows_and_columns = df.iloc[2:5, :5]
print(rows_and_columns)
# Output recorded in the original article:
#    PassengerId  ...     Sex
# 2            3  ...  female
# 3            4  ...  female
# 4            5  ...    male
#
# [3 rows x 5 columns]

# Conditional selection: passengers whose Pclass is at most 2 and whose Sex
# is "female". Each condition is built as its own boolean mask, then the
# two are combined element-wise.
pclass_at_most_two = df["Pclass"] <= 2
is_female = df["Sex"] == "female"
print(df[pclass_at_most_two & is_female].head(3))
# Output recorded in the original article:
#    PassengerId  Survived  Pclass  ...     Fare  Cabin  Embarked
# 1            2         1       1  ...  71.2833    C85         C
# 3            4         1       1  ...  53.1000   C123         S
# 9           10         1       2  ...  30.0708    NaN         C
#
# [3 rows x 12 columns]
3、數據的清洗
# Drop every row that contains at least one missing value. dropna returns
# a new frame; the final print shows that df itself keeps its shape.
print(df.shape)  # (891, 12)
ret = df.dropna(how="any")
print(ret.shape)  # (183, 12)
print(df.shape)  # (891, 12)

# Replace every missing value with 0, then compare the same row before
# (df) and after (ret).
ret = df.fillna(value=0)
print(df.loc[633])
print(ret.loc[633])
# Output recorded in the original article:
# PassengerId                              634
# Survived                                   0
# Pclass                                     1
# Name           Parr, Mr. William Henry Marsh
# Sex                                     male
# Age                                      NaN
# SibSp                                      0
# Parch                                      0
# Ticket                                112052
# Fare                                       0
# Cabin                                    NaN
# Embarked                                   S
# Name: 633, dtype: object
# PassengerId                              634
# Survived                                   0
# Pclass                                     1
# Name           Parr, Mr. William Henry Marsh
# Sex                                     male
# Age                                        0
# SibSp                                      0
# Parch                                      0
# Ticket                                112052
# Fare                                       0
# Cabin                                      0
# Embarked                                   S
# Name: 633, dtype: object

# Fill missing ages with the mean of the Age column.
ret = df['Age'].fillna(df['Age'].mean())
print(ret.shape)  # (891,)

# String handling, e.g. upper-casing. The .str accessor passes missing
# values through as NaN, whereas map(str.upper) raises TypeError on them.
print(df["Name"].str.upper().head(3))
# Output recorded in the original article:
# 0                              BRAUND, MR. OWEN HARRIS
# 1    CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...
# 2                               HEIKKINEN, MISS. LAINA
# Name: Name, dtype: object

# Map the numeric class codes to text labels via a dict lookup. This
# overwrites df['Pclass'] in place.
df['Pclass'] = df['Pclass'].map({1: '一等艙', 2: '二等艙', 3: '三等艙'})
print(df.head(3))
# Output recorded in the original article:
#    PassengerId  Survived  Pclass  ...     Fare  Cabin  Embarked
# 0            1         0  三等艙  ...   7.2500    NaN         S
# 1            2         1  一等艙  ...  71.2833    C85         C
# 2            3         1  三等艙  ...   7.9250    NaN         S
#
# [3 rows x 12 columns]

# Column dtypes; Pclass is now object after the mapping above.
print(df.dtypes)
# Output recorded in the original article:
# PassengerId      int64
# Survived         int64
# Pclass          object
# Name            object
# Sex             object
# Age            float64
# SibSp            int64
# Parch            int64
# Ticket          object
# Fare           float64
# Cabin           object
# Embarked        object
# dtype: object

# Convert Fare from float64 to an integer dtype.
ret = df['Fare'].astype('int')
# The article recorded int32 here.
# NOTE(review): the width chosen for 'int' may differ by platform - confirm.
print(ret.dtype)  # int32

# Rename a column; rename returns a new frame.
ret = df.rename(columns={'Survived': '是否獲救'})
print(ret.head(3))
# Output recorded in the original article:
#    PassengerId  是否獲救  Pclass  ...     Fare  Cabin  Embarked
# 0            1         0  三等艙  ...   7.2500    NaN         S
# 1            2         1  一等艙  ...  71.2833    C85         C
# 2            3         1  三等艙  ...   7.9250    NaN         S
#
# [3 rows x 12 columns]

# Remove duplicates, e.g. to list the distinct embarkation values.
ret = df['Embarked'].drop_duplicates()
print(ret)
# Output recorded in the original article:
# 0       S
# 1       C
# 5       Q
# 61    NaN
# Name: Embarked, dtype: object

# Value replacement: 'male' becomes '男'; other values are left as they are.
df['Sex'] = df['Sex'].replace('male', '男')
print(df["Sex"].head(3))
# Output recorded in the original article:
# 0        男
# 1    female
# 2    female
# Name: Sex, dtype: object
4、數據的排序
# Sort by Age in descending order and show the top three ages.
sorted_by_age = df.sort_values(by=['Age'], ascending=False)
print(sorted_by_age["Age"].head(3))
# Output recorded in the original article:
# 630    80.0
# 851    74.0
# 493    71.0
# Name: Age, dtype: float64

# Sort by index in descending order and show the first three rows.
sorted_by_index = df.sort_index(ascending=False)
print(sorted_by_index.head(3))
# Output recorded in the original article:
#      PassengerId  Survived  Pclass  ...   Fare  Cabin  Embarked
# 890          891         0  三等艙  ...   7.75    NaN         Q
# 889          890         1  一等艙  ...  30.00   C148         C
# 888          889         0  三等艙  ...  23.45    NaN         S