Pandas_Guide_For_Teammates

import numpy as np
import pandas as pd

์‹œ๋ฆฌ์ฆˆ

  • ์ธ๋ฑ์Šค๋Š” ๋ฐ์ดํ„ฐ๊ฐ’๊ณผ ์ผ๋Œ€์ผ ๋งค์นญ
  • ๋ฐ์ดํ„ฐ๊ฐ€ ๋‚˜์—ด๋œ 1์ฐจ์› ํ˜•์‹์˜ ๋ฐฐ์—ด ํ˜•ํƒœ

์‹œ๋ฆฌ์ฆˆ ์ƒ์„ฑ

  • ๋”•์…”๋„ˆ๋ฆฌ ๋ฐ์ดํ„ฐ ์ƒ์„ฑ
  • Sr = pd.Series(data=๋”•์…”๋„ˆ๋ฆฌ๋ฐ์ดํ„ฐ์ด๋ฆ„, index=[])
series_data = {"ํŒ€์›1":"๊น€์Šน๊ทœ","ํŒ€์›2":"๋ฐ•์„ฑ์ค€","ํŒ€์›3":"๊น€์•„๋žŒ"}
sr = pd.Series(data=series_data)
sr
ํŒ€์›1    ๊น€์Šน๊ทœ
ํŒ€์›2    ๋ฐ•์„ฑ์ค€
ํŒ€์›3    ๊น€์•„๋žŒ
dtype: object

์ธ๋ฑ์Šค ๊ตฌ์กฐ

  • ์ธ๋ฑ์Šค๊ฐ’ ๋ฐฐ์—ด

    • ์‹œ๋ฆฌ์ฆˆ ๊ฐ์ฒด.index
  • ๋ฐ์ดํ„ฐ๊ฐ’ ๋ฐฐ์—ด

    • ์‹œ๋ฆฌ์ฆˆ ๊ฐ์ฒด.values
sr.index
Index(['ํŒ€์›1', 'ํŒ€์›2', 'ํŒ€์›3'], dtype='object')
sr.values
array(['๊น€์Šน๊ทœ', '๋ฐ•์„ฑ์ค€', '๊น€์•„๋žŒ'], dtype=object)

์›์†Œ ์„ ํƒ

  • ์ •์ˆ˜ํ˜• ์œ„์น˜ ์ธ๋ฑ์Šค: iloc.๋Œ€๊ด„ํ˜ธ([])์•ˆ์— ์œ„์น˜๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” ์ˆซ์ž ์ž…๋ ฅ.
  • 문자열 위치 인덱스: loc.대괄호([])안에 인덱스 이름 입력.
sr.iloc[2]
'๊น€์•„๋žŒ'
sr.loc["ํŒ€์›2"]
'๋ฐ•์„ฑ์ค€'

๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„

  • ๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„์€ ๋ฐ์ดํ„ฐ ๋ฐฐ์—ด ํ˜•ํƒœ.
  • ์—ฌ๋Ÿฌ๊ฐœ์˜ ์—ด ๋ฒกํ„ฐ๋“ค์ด ๊ฐ™์€ ํ–‰ ์ธ๋ฑ์Šค ๊ธฐ์ค€์œผ๋กœ ์ค„์ง€์–ด ๊ฒฐํ•ฉ๋œ 2์ฐจ์› ๋ฒกํ„ฐ ๋˜๋Š” ํ–‰๋ ฌ.

๋ฐ์ดํ„ฐ ํ”„๋ ˆ์ž„ ๋งŒ๋“ค๊ธฐ

  • pandas.DataFrame(๋”•์…”๋„ˆ๋ฆฌํ˜•ํƒœ ๊ฐ์ฒด ์ด๋ฆ„)
  • df = pandas.DataFrame(๋”•์…”๋„ˆ๋ฆฌํ˜•ํƒœ ๊ฐ์ฒด ์ด๋ฆ„)
ํšŒ์‚ฌID๋ฆฌ์ŠคํŠธ = {"ํšŒ์‚ฌID1":"1232134","ํšŒ์‚ฌID2":"5839204","ํšŒ์‚ฌID3":"13224213"}
df = pd.DataFrame(ํšŒ์‚ฌID๋ฆฌ์ŠคํŠธ, index = ["1","2","3"])
display(df)
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3
1 1232134 5839204 13224213
2 1232134 5839204 13224213
3 1232134 5839204 13224213

ํ–‰ ์ด๋ฆ„๊ณผ ์ธ๋ฑ์Šค ์ด๋ฆ„ ์„ค์ •

  • pd.DataFrame(๋ฐ์ดํ„ฐ์ด๋ฆ„, index = [], columns =[])

ํ–‰ ์ด๋ฆ„๊ณผ ์ธ๋ฑ์Šค ์ด๋ฆ„ ๋ฐ”๊พธ๊ธฐ

  • df.index = [์ƒˆ๋กœ์šด ํ–‰ ์ด๋ฆ„ ๋ฐฐ์—ด]
  • df.columns = [์ƒˆ๋กœ์šด ์—ด ์ด๋ฆ„ ๋ฐฐ์—ด]

ํŠน์ • ํ–‰ ์ด๋ฆ„๊ณผ ์ธ๋ฑ์Šค ์ด๋ฆ„ ๋ฐ”๊พธ๊ธฐ

  • df.rename(index={"์˜ˆ์ „์ด๋ฆ„":"๋ฐ”๊ฟ€์ด๋ฆ„","์˜ˆ์ „์ด๋ฆ„":"๋ฐ”๊ฟ€์ด๋ฆ„"})
  • df.rename(columns={"์˜ˆ์ „์ด๋ฆ„":"๋ฐ”๊ฟ€์ด๋ฆ„","์˜ˆ์ „์ด๋ฆ„":"๋ฐ”๊ฟ€์ด๋ฆ„"})
df.rename(index={"1":"1_ํšŒ์‚ฌ"}) #์‹ค์ œ df์—๋Š” ์˜ํ–ฅ์„ ์ฃผ์ง€ ์•Š์Œ.
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3
1_ํšŒ์‚ฌ 1232134 5839204 13224213
2 1232134 5839204 13224213
3 1232134 5839204 13224213
df.rename(columns={"ํšŒ์‚ฌID1":"ํšŒ์‚ฌID_1"}) #์‹ค์ œ df์—๋Š” ์˜ํ–ฅ์„ ์ฃผ์ง€ ์•Š์Œ.
ํšŒ์‚ฌID_1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3
1 1232134 5839204 13224213
2 1232134 5839204 13224213
3 1232134 5839204 13224213

ํ–‰,์—ด ์‚ญ์ œ

ํ–‰, ์—ด์„ ์†Œ๊ฑฐ์‹œ drop()๋ฉ”์„œ๋“œ๋ฅผ ์‚ฌ์šฉํ•˜๊ณ  ํ–‰์€ axis=0 ์—ด์€ axis=1๋กœ ์„ค์ •ํ•œ๋‹ค.

  • df.drop(ํ–‰ ์ธ๋ฑ์Šค ๋˜๋Š” ๋ฆฌ์ŠคํŠธ, axis = 0)
  • df.drop(์—ด ์ธ๋ฑ์Šค ๋˜๋Š” ๋ฆฌ์ŠคํŠธ, axis = 1)
df.drop("1", axis=0) #์‹ค์ œ df์—๋Š” ์˜ํ–ฅ์„ ์ฃผ์ง€ ์•Š์Œ.
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3
2 1232134 5839204 13224213
3 1232134 5839204 13224213
df.drop(["1","2"], axis=0) #์‹ค์ œ df์—๋Š” ์˜ํ–ฅ์„ ์ฃผ์ง€ ์•Š์Œ.
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3
3 1232134 5839204 13224213
df.drop("ํšŒ์‚ฌID1", axis=1)
#df.drop(df[["ํšŒ์‚ฌID1"]], axis=1) ์™€๋„ ๊ฐ™๋‹ค.
ํšŒ์‚ฌID2 ํšŒ์‚ฌID3
1 5839204 13224213
2 5839204 13224213
3 5839204 13224213
#df = df.drop(index=df.query('์—ด์ด๋ฆ„ ๋น„๊ต์—ฐ์‚ฐ์ž ์กฐ๊ฑด').index)

ํ–‰ ์ธ๋ฑ์Šค ์„ ํƒ

  • ๋ฐ์ดํ„ฐํ”„๋ž˜์ž„์˜ ํ–‰ ์ธ๋ฑ์Šค๋ฅผ ์„ ํƒํ•˜๊ธฐ ์œ„ํ•ด์„œ๋Š” loc์™€ iloc์ธ๋ฑ์„œ๋ฅผ ์‚ฌ์šฉํ•œ๋‹ค.
  • ๋ฌธ์žํƒ€์ž…์˜ ์ธ๋ฑ์Šค๋ฅผ ์„ ํƒํ•˜๊ธฐ ์œ„ํ•ด์„œ๋Š” loc
  • ์ •์ˆ˜ํ˜•ํƒ€์ž…์˜ ์ธ๋ฑ์Šค๋ฅผ ์„ ํƒํ•˜๊ธฐ ์œ„ํ•ด์„œ๋Š” iloc์„ ์‚ฌ์šฉํ•œ๋‹ค.
๊ตฌ๋ถ„ loc iloc
ํƒ์ƒ‰๋Œ€์ƒ ์ธ๋ฑ์Šค์ด๋ฆ„ ์ •์ˆ˜ํ˜•์œ„์น˜
๋ฒ”์œ„์ง€์ • ๊ฐ€๋Šฅ(๋ฒ”์œ„๋ํฌํ•จ) ๊ฐ€๋Šฅ(๋ฒ”์œ„๋์ œ์™ธ)
ํ…Œ์ŠคํŠธ1 loc['a':'c'] = 'a','b','c' iloc[1:4] = 1,2,3
df.loc["1"]
ํšŒ์‚ฌID1     1232134
ํšŒ์‚ฌID2     5839204
ํšŒ์‚ฌID3    13224213
Name: 1, dtype: object
df.loc["1":"3"]
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3
1 1232134 5839204 13224213
2 1232134 5839204 13224213
3 1232134 5839204 13224213
df.loc[["1","3"]]
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3
1 1232134 5839204 13224213
3 1232134 5839204 13224213

์—ด ์„ ํƒ

  • ์—ด 1๊ฐœ๋งŒ ์„ ํƒํ• ๋•Œ: df["์—ด ์ด๋ฆ„"] or df.์—ด ์ด๋ฆ„
  • ์—ด์„ ์—ฌ๋Ÿฌ๊ฐœ ์„ ํƒํ• ๋•Œ: df[["",""]]
df["ํšŒ์‚ฌID1"]
1    1232134
2    1232134
3    1232134
Name: ํšŒ์‚ฌID1, dtype: object
df[["ํšŒ์‚ฌID1","ํšŒ์‚ฌID2"]]
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2
1 1232134 5839204
2 1232134 5839204
3 1232134 5839204

์›์†Œ ์„ ํƒ

  • ํ–‰ ์ธ๋ฑ์Šค์™€ ์—ด ์ด๋ฆ„์„ [ํ–‰,์—ด]ํ˜•์‹์˜ 2์ฐจ์› ์ขŒํ‘œ๋กœ ์ž…๋ ฅํ•œ๋‹ค.

    • ์ธ๋ฑ์Šค ์ด๋ฆ„ ๋ฐฉ์‹: ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„.loc[ํ–‰ ์ธ๋ฑ์Šค, ์—ด ๋ฒˆํ˜ธ]
    • ์ •์ˆ˜ ์œ„์น˜ ์ธ๋ฑ์Šค ๋ฐฉ์‹: ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„.iloc[ํ–‰ ๋ฒˆํ˜ธ, ์—ด ๋ฒˆํ˜ธ]
df.loc["2","ํšŒ์‚ฌID2"]  # Selects one element by row label and column name (reads the same value as the chained form df.loc[row][column]). To select several rows or columns, wrap the labels in lists.
'5839204'
df.loc[["1","3"], ["ํšŒ์‚ฌID1","ํšŒ์‚ฌID3"]]
ํšŒ์‚ฌID1 ํšŒ์‚ฌID3
1 1232134 13224213
3 1232134 13224213
df.loc["1":"3", ["ํšŒ์‚ฌID1","ํšŒ์‚ฌID3"]]
ํšŒ์‚ฌID1 ํšŒ์‚ฌID3
1 1232134 13224213
2 1232134 13224213
3 1232134 13224213

ํ–‰ ์ถ”๊ฐ€

  • df.loc[์ธ๋ฑ์Šค ์ด๋ฆ„] = ๋ฐ์ดํ„ฐ๊ฐ’ ๋˜๋Š” ๋ฆฌ์ŠคํŠธ
df.loc["4"] = ["2324124","1314213","124214123"] #df์— ๋ณ€ํ™”๊ฐ€ ๋ฐ”๋กœ ์ ์šฉ
df
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3
1 1232134 5839204 13224213
2 1232134 5839204 13224213
3 1232134 5839204 13224213
4 2324124 1314213 124214123

์—ด ์ถ”๊ฐ€

  • df["์ƒˆ๋กœ์šด ์—ด ์ด๋ฆ„"] = ๋ฐ์ดํ„ฐ๊ฐ’ ๋˜๋Š” ๋ฆฌ์ŠคํŠธ
df["ํšŒ์‚ฌID4"] = ["423","23532","235","8679"] #df์— ๋ณ€ํ™”๊ฐ€ ๋ฐ”๋กœ ์ ์šฉ
df
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3 ํšŒ์‚ฌID4
1 1232134 5839204 13224213 423
2 1232134 5839204 13224213 23532
3 1232134 5839204 13224213 235
4 2324124 1314213 124214123 8679

์›์†Œ ๊ฐ’ ๋ณ€๊ฒฝ

  • df.loc["행 이름", "열 이름"] = 새로운 값
  • df.iloc[행 번호, 열 번호] = 새로운 값
  • (df.loc["행"]["열"] = 값 과 같은 연쇄 인덱싱 대입은 복사본에 값이 들어가 원본이 바뀌지 않을 수 있으므로 사용하지 않는다.)
# Write through a single .loc[row, column] indexer. The chained form
# df.loc[row][col] = value assigns into an intermediate object, so it is not
# guaranteed to update df (and never does under pandas Copy-on-Write).
df.loc["1", "ํšŒ์‚ฌID1"] = 3  # updates df in place
df.loc["1", "ํšŒ์‚ฌID1":"ํšŒ์‚ฌID4"] = 3  # label slices include the end column
df
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3 ํšŒ์‚ฌID4
1 3 3 3 3
2 1232134 5839204 13224213 23532
3 1232134 5839204 13224213 235
4 2324124 1314213 124214123 8679

ํŠน์ • ์—ด์„ ํ–‰ ์ธ๋ฑ์Šค๋กœ ์„ค์ •

  • df.set_index(["์—ด ์ด๋ฆ„๋“ค"])
  • df.set_index("์—ด ์ด๋ฆ„")

ํ–‰ ์ธ๋ฑ์Šค ์žฌ๋ฐฐ์—ด

  • ๊ธฐ์กด ๊ฐ์ฒด๋ฅผ ๋ณ€๊ฒฝํ•˜์ง€ ์•Š๊ณ  ์ƒˆ๋กœ์šด ๋ฐ์ดํ„ฐํ”„๋ž˜์ž„ ๊ฐ์ฒด๋ฅผ ๋ฐ˜ํ™˜ํ•œ๋‹ค
# Labels for the new row index.
new_obj = ['comp_row1','comp_row2','comp_row3','comp_row4']
# reindex() returns a new DataFrame and leaves df itself unchanged; labels that
# are not present in df's index come back as rows filled with NaN.
df_new = df.reindex(new_obj)
df_new
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3 ํšŒ์‚ฌID4
comp_row1 NaN NaN NaN NaN
comp_row2 NaN NaN NaN NaN
comp_row3 NaN NaN NaN NaN
comp_row4 NaN NaN NaN NaN

์žฌ๋ฐฐ์—ด๋œ df_new ๋ณ€์ˆ˜์— ๊ฐ’ ํ• ๋‹น

# Assign with one .loc[rows, column] call. The chained form
# df_new.loc[rows][column] = values writes into an intermediate object and is
# not guaranteed to reach df_new (it never does under pandas Copy-on-Write).
df_new.loc["comp_row1":"comp_row4", "ํšŒ์‚ฌID1"] = ["31","34","213","421"]
df_new
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3 ํšŒ์‚ฌID4
comp_row1 31 NaN NaN NaN
comp_row2 34 NaN NaN NaN
comp_row3 213 NaN NaN NaN
comp_row4 421 NaN NaN NaN
df
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3 ํšŒ์‚ฌID4
1 3 3 3 3
2 1232134 5839204 13224213 23532
3 1232134 5839204 13224213 235
4 2324124 1314213 124214123 8679

ํ–‰ ์ธ๋ฑ์Šค ์ดˆ๊ธฐํ™”

  • df.reset_index()

ํ–‰ ์ธ๋ฑ์Šค๋ฅผ ๊ธฐ์ค€์œผ๋กœ ๋ฐ์ดํ„ฐ ์ •๋ ฌ

  • df.sort_index(ascending = True ๋˜๋Š” False)
df.sort_index(ascending = False)
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3 ํšŒ์‚ฌID4
4 2324124 1314213 124214123 8679
3 1232134 5839204 13224213 235
2 1232134 5839204 13224213 23532
1 3 3 3 3

์—ด ์ธ๋ฑ์Šค๋ฅผ ๊ธฐ์ค€์œผ๋กœ ๋ฐ์ดํ„ฐ ์ •๋ ฌ

df.sort_values(by="열이름", ascending=True 또는 False)

df["ํšŒ์‚ฌID1"] = df["ํšŒ์‚ฌID1"].astype(int)
df.sort_values(by = "ํšŒ์‚ฌID1", ascending=False)
ํšŒ์‚ฌID1 ํšŒ์‚ฌID2 ํšŒ์‚ฌID3 ํšŒ์‚ฌID4
4 2324124 1314213 124214123 8679
2 1232134 5839204 13224213 23532
3 1232134 5839204 13224213 235
1 3 3 3 3

์‚ฐ์ˆ ์—ฐ์‚ฐ

  • ํŒ๋‹ค์Šค์˜ ์‚ฐ์ˆ  ์—ฐ์‚ฐ์€ 3๋‹จ๊ณ„๋ฅผ ๊ฑฐ์นœ๋‹ค.

    1. ํ–‰/์—ด ์ธ๋ฑ์Šค๋ฅผ ๊ธฐ์ค€์œผ๋กœ ๋ชจ๋“  ์›์†Œ๋ฅผ ์ •๋ ฌํ•œ๋‹ค.
    2. ๋™์ผํ•œ ์œ„์น˜์— ์žˆ๋Š” ์›์†Œ๋ผ๋ฆฌ ์ผ๋Œ€์ผ๋กœ ๋Œ€์‘์‹œํ‚จ๋‹ค.
    3. ์ผ๋Œ€์ผ ๋Œ€์‘๋˜๋Š” ์›์†Œ๋ผ๋ฆฌ ์—ฐ์‚ฐ์„ ์ฒ˜๋ฆฌํ•œ๋‹ค.
    4. ๋Œ€์‘๋˜๋Š” ์›์†Œ๊ฐ€ ์—†๋‹ค๋ฉด nan์œผ๋กœ ์ฒ˜๋ฆฌํ•œ๋‹ค.

์‹œ๋ฆฌ์ฆˆ (+-*/) ์ˆซ์ž

  • ์‹œ๋ฆฌ์ฆˆ ๊ฐ์ฒด์— ์–ด๋–ค ์ˆซ์ž๋ฅผ ์—ฐ์‚ฐํ•˜๋ฉด ์‹œ๋ฆฌ์ฆˆ์˜ ๊ฐœ๋ณ„ ์›์†Œ์— ๊ฐ๊ฐ ์ ์šฉ๋œ ๊ฐ์ฒด๋กœ ๋ฐ˜ํ™˜๋œ๋‹ค.
d = {"index1":1,"index2":2,"index3":3}
sr2 = pd.Series(d)
display(sr2)
index1    1
index2    2
index3    3
dtype: int64
sr2+2 #sr2์— ๋ฐ”๋กœ ์ ์šฉ ์•ˆ๋จ. 
index1    3
index2    4
index3    5
dtype: int64

์‹œ๋ฆฌ์ฆˆ (+-*/) ์‹œ๋ฆฌ์ฆˆ

  • ์‹œ๋ฆฌ์ฆˆ๊ฐ€ ๋‹ค๋ฅธ ์‹œ๋ฆฌ์ฆˆ์™€ ์—ฐ์‚ฐํ• ๋•Œ ๊ฐ™์€ ์ธ๋ฑ์Šค๋ผ๋ฆฌ ์—ฐ์‚ฐํ•˜๋‹ค.
  • ์ธ๋ฑ์Šค๊ฐ€ ์ •๋ ฌ์ด ์•ˆ๋˜์–ด์žˆ๋‹คํ•ด๋„ ํŒ๋‹ค์Šค๋Š” ์•Œ์•„์„œ ๊ฐ™์€ ์ธ๋ฑ์Šค๋ฅผ ์ฐพ์•„ ์—ฐ์‚ฐํ•œ๋‹ค.
  • ์—ฐ์‚ฐ์„ ํ•˜๋Š” ๋‘ ์‹œ๋ฆฌ์ฆˆ์˜ ํฌ๊ธฐ๊ฐ€ ๋‹ค๋ฅด๊ฑฐ๋‚˜ ํฌ๊ธฐ๊ฐ€ ๊ฐ™๋”๋ผ๋„ ์ธ๋ฑ์Šค๊ฐ€ ๋‹ค๋ฅด๋‹ค๋ฉด ํŒ๋‹จ์Šค๋Š” ์œ ํšจํ•œ ๊ฐ™์ด ์—†๋‹ค๊ณ  ์ƒ๊ฐํ•˜์—ฌ nan์„ ์ถ”์ถœํ•œ๋‹ค.

์‹œ๋ฆฌ์ฆˆ ํฌ๊ธฐ๊ฐ€ ๊ฐ™๊ณ  ์ธ๋ฑ์Šค๋„ ๊ฐ™์€ ์ƒํ™ฉ

d = {"index1":4, "index2":9, "index3":10}
sr3 = pd.Series(data=d)
sr2+sr3
index1     5
index2    11
index3    13
dtype: int64

์‹œ๋ฆฌ์ฆˆ ํฌ๊ธฐ๋Š” ๊ฐ™์€๋ฐ ์ธ๋ฑ์Šค๊ฐ€ ๋‹ค๋ฅธ ์ƒํ™ฉ

d = {"index4":3, "index5":1, "index6":3}
sr4 = pd.Series(data=d)
sr3+sr4
index1   NaN
index2   NaN
index3   NaN
index4   NaN
index5   NaN
index6   NaN
dtype: float64

์‹œ๋ฆฌ์ฆˆ ํฌ๊ธฐ๋Š” ๋‹ค๋ฅด๊ณ  ์ธ๋ฑ์Šค๋Š” ๊ฐ™์€ ์ƒํ™ฉ

d = {"index1":4, "index2":23490, "index3":33,"index4":123}
sr5 = pd.Series(data = d)
sr2+sr5
index1        5.0
index2    23492.0
index3       36.0
index4        NaN
dtype: float64

์—ฐ์‚ฐ ๋ฉ”์†Œ๋“œ

  • ๊ฐ์ฒด ์‚ฌ์ด์˜ ๊ณตํ†ต๋œ ์ธ๋ฑ์Šค๊ฐ€ ์—†๊ฑฐ๋‚˜ Nan์ด ํฌํ•จ๋œ ๊ฒฝ์šฐ ์—ฐ์‚ฐ ๊ฒฐ๊ณผ๊ฐ€ Nan์ด ๋ฐ˜ํ™˜๋˜๋Š”๋ฐ ์ด๋ฅผ ๋ฐฉ์ง€ํ•˜๊ธฐ ์œ„ํ•ด์„œ ๋ฉ”์†Œ๋“œ์•ˆ์— fill_value ์˜ต์…˜์„ ์จ์ค€๋‹ค.
  • ๋ง์…ˆ: ์‹œ๋ฆฌ์ฆˆ๊ฐ์ฒด1.add(์‹œ๋ฆฌ์ฆˆ๊ฐ์ฒด2, fill_value=0)
  • ๋บ„์…ˆ: ์‹œ๋ฆฌ์ฆˆ๊ฐ์ฒด1.sub(์‹œ๋ฆฌ์ฆˆ๊ฐ์ฒด2, fill_value=0)
  • ๊ณฑ์…ˆ: ์‹œ๋ฆฌ์ฆˆ๊ฐ์ฒด1.mul(์‹œ๋ฆฌ์ฆˆ๊ฐ์ฒด2, fill_value=0)
  • ๋‚˜๋ˆˆ์…ˆ: ์‹œ๋ฆฌ์ฆˆ๊ฐ์ฒด1.div(์‹œ๋ฆฌ์ฆˆ๊ฐ์ฒด2, fill_value=0)
sr2.add(sr4, fill_value=0)
index1    1.0
index2    2.0
index3    3.0
index4    3.0
index5    1.0
index6    3.0
dtype: float64
sr3.sub(sr2, fill_value=0)
index1    3
index2    7
index3    7
dtype: int64
sr3
index1     4
index2     9
index3    10
dtype: int64

๋ฐ์ดํ„ฐ ์ž…์ถœ๋ ฅ

CSV

CSVํŒŒ์ผ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ

  • pd.read_csv('ํŒŒ์ผ๊ฒฝ๋กœ/ํŒŒ์ผ์ด๋ฆ„.csv')

CSV๋กœ ์ €์žฅํ•˜๊ธฐ

  • ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„.to_csv('ํŒŒ์ผ๊ฒฝ๋กœ/ํŒŒ์ผ์ด๋ฆ„.csv')

Excel

Excel ํŒŒ์ผ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ

  • pandas.read_excel('ํŒŒ์ผ๊ฒฝ๋กœ/ํŒŒ์ผ์ด๋ฆ„.xlsx')

Excel ํŒŒ์ผ๋กœ ์ €์žฅํ•˜๊ธฐ

  • ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„.to_excel('ํŒŒ์ผ๊ฒฝ๋กœ/ํŒŒ์ผ์ด๋ฆ„.xlsx')

์—ฌ๋Ÿฌ๊ฐœ์˜ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์„ ํ•œ๊ฐœ์˜ excelํŒŒ์ผ๋กœ ์ €์žฅ

  • writer = pandas.ExcelWriter("파일경로/파일이름.xlsx")
  • df1.to_excel(writer, sheet_name="sheet1")
  • df2.to_excel(writer, sheet_name="sheet2")
  • writer.close()  (writer.save()는 pandas 2.0에서 제거되었다.)

Json

Json ํŒŒ์ผ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ

  • pandas.read_json('ํŒŒ์ผ๊ฒฝ๋กœ/ํŒŒ์ผ์ด๋ฆ„.json')

Json ํŒŒ์ผ๋กœ ์ €์žฅํ•˜๊ธฐ

  • df.to_json('ํŒŒ์ผ๊ฒฝ๋กœ/ํŒŒ์ผ๋ช….json')

HTML

HTML ํŒŒ์ผ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ

  • ์›น์— ์žˆ๋Š” ํƒœ๊ทธ๊ฐ€ ๋ถ™์€
  • pandas.read_html('์›น์ฃผ์†Œ url')
  • pandas.read_html('ํŒŒ์ผ๊ฒฝ๋กœ/ํŒŒ์ผ๋ช….html')
  • ๋ฐ์ดํ„ฐ ์‚ดํŽด๋ณด๊ธฐ

    ๊ฐ ์—ด์˜ ๊ฐœ์ˆ˜

    • count() ๋ฉ”์„œ๋“œ๋Š” ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์˜ ๊ฐ ์—ด์ด ๊ฐ€์ง€๊ณ  ์žˆ๋Š” ๋ฐ์ดํ„ฐ ๊ฐœ์ˆ˜๋ฅผ ์‹œ๋ฆฌ์ฆˆ ๊ฐ์ฒด๋กœ ๋ฐ˜ํ™˜ํ•œ๋‹ค.
    • ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„.count() --> ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์„ ์ด๋ฃจ๊ณ  ์žˆ๋Š” ์—ด์˜ ๊ฐœ์ˆ˜๋ฅผ ์‹œ๋ฆฌ์ฆˆ ๊ฐ์ฒด๋กœ ๋ฐ˜ํ™˜ํ•œ๋‹ค.

    ๊ฐ ์—ด์˜ ๊ณ ์œณ๊ฐ’ ๊ฐœ์ˆ˜

    • value_counts() ๋ฉ”์„œ๋“œ๋Š” ์‹œ๋ฆฌ์ฆˆ ๊ฐ์ฒด์˜ ๊ณ ์œณ๊ฐ’ ๊ฐœ์ˆ˜๋ฅผ ์„ธ๋Š”๋ฐ ์‚ฌ์šฉํ•œ๋‹ค.
    • dropna=True ์˜ต์…˜์„ ์‚ฌ์šฉํ•˜๋ฉด nan๊ฐ’์„ ์ œ์™ธํ•˜๊ณ  ์ˆซ์ž๋ฅผ ์„ผ๋‹ค.
    • dropna=False๊ฐ€ ๊ธฐ๋ณธ๊ฐ’์ด๋‹ค. ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„["์—ด์ด๋ฆ„"].value_counts()
    import pandas as pd
    import numpy as np
    data = {"A": [1,2,3,4,5],"B":[3,59,30,1,2],"C":["dw","vc","qw","bb","ll"]}
    df = pd.DataFrame(data)
    df.count() #์ „์ฒด ๋ฐ์ดํ„ฐ์˜ ์—ด์ด ๊ฐ€์ง€๊ณ  ์žˆ๋Š” ๊ฐฏ์ˆ˜ 
    A    5
    B    5
    C    5
    dtype: int64
    df["A"].value_counts() #ํ•œ ์—ด์ด ๊ฐ€์ง€๊ณ  ์žˆ๋Š” value๊ฐ’๋“ค์˜ ๊ณ ์œ  ๊ฐฏ์ˆ˜ counting 
    5    1
    4    1
    3    1
    2    1
    1    1
    Name: A, dtype: int64

    ํ†ต๊ณ„ ํ•จ์ˆ˜ ์ ์šฉ

    ํ‰๊ท ๊ฐ’

    • ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์— mean()๋ฉ”์„œ๋“œ๋ฅผ ์ ์šฉํ•˜๋ฉด, ์‚ฐ์ˆ  ๋ฐ์ดํ„ฐ๋ฅผ ๊ฐ–๋Š” ๋ชจ๋“  ์—ด์˜ ํ‰๊ท ๊ฐ’์„ ๊ฐ๊ฐ ๊ณ„์‚ฐํ•˜์—ฌ ์‹œ๋ฆฌ์ฆˆ ๊ฐ์ฒด๋กœ ๋ฐ˜ํ™˜ํ•œ๋‹ค.
    • df.mean() df์˜ ์—ด ํ‰๊ท ์„ ๊ณ„์‚ฐ
    • df["์—ด์ด๋ฆ„"].mean() ์„ ํƒ ๋ฐ›์€ ์—ด์˜ ํ‰๊ท  ๊ณ„์‚ฐ.

    ์ค‘๊ฐ„๊ฐ’

    • ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์— median()๋ฉ”์„œ๋“œ๋ฅผ ์ ์šฉํ•˜๋ฉด, ์‚ฐ์ˆ  ๋ฐ์ดํ„ฐ๋ฅผ ๊ฐ–๋Š” ๋ชจ๋“  ์—ด์˜ ์ค‘๊ฐ„๊ฐ’์„ ๊ฐ๊ฐ ๊ณ„์‚ฐํ•˜์—ฌ ์‹œ๋ฆฌ์ฆˆ ๊ฐ์ฒด๋กœ ๋ฐ˜ํ™˜ํ•œ๋‹ค.
    • ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์˜ ํŠน์ • ์—ด์„ ์„ ํƒํ•˜์—ฌ ์ค‘๊ฐ„ ๊ฐ’์„ ๊ณ„์‚ฐํ•  ์ˆ˜๋„ ์žˆ๋‹ค.
    • df.median()
    • df["์—ด์ด๋ฆ„"].median()

    ์ตœ๋Œ€๊ฐ’

    • ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์— max() ๋ฉ”์„œ๋“œ๋ฅผ ์ ์šฉํ•˜๋ฉด ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์˜ ๊ฐ ์—ด์ด ๊ฐ–๋Š” ๋ฐ์ดํ„ฐ๊ฐ’ ์ค‘์—์„œ ์ตœ๋Œ€๊ฐ’์„ ๊ณ„์‚ฐํ•˜์—ฌ ์‹œ๋ฆฌ์ฆˆ๋กœ ๋ฐ˜ํ™˜ํ•œ๋‹ค. \
    • ํŠน์ •์—ด์„ ์„ ์ฑ…ํ•˜์—ฌ ๊ณ„์‚ฐํ•  ์ˆ˜๋„ ์žˆ๋‹ค.
    • ๋ฌธ์ž์—ด ๋ฐ์ดํ„ฐ๋Š” ์•„์Šคํ‚ค์ฝ”๋“œ๋กœ ํฌ๊ณ  ์ž‘์Œ์„ ๋น„๊ตํ•œ๋‹ค.
    • ๋ชจ๋“  ์—ด์˜ ์ตœ๋Œ€๊ฐ’: df.max()
    • ํŠน์ • ์—ด์˜ ์ตœ๋Œ€๊ฐ’: df["์—ด ์ด๋ฆ„"].max()
    df
    A B C
    0 1 3 dw
    1 2 59 vc
    2 3 30 qw
    3 4 1 bb
    4 5 2 ll
df.max()
A     5
B    59
C    vc
dtype: object
df["A"].max()
5
df[["A","B"]].max()
A     5
B    59
dtype: int64

์ตœ์†Œ๊ฐ’

  • ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์— min() ๋ฉ”์„œ๋“œ๋ฅผ ์ ์šฉํ•˜๋ฉด ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์˜ ๊ฐ ์—ด์ด ๊ฐ–๋Š” ๋ฐ์ดํ„ฐ๊ฐ’ ์ค‘์—์„œ ์ตœ์†Œ๊ฐ’์„ ๊ณ„์‚ฐํ•˜์—ฌ ์‹œ๋ฆฌ์ฆˆ ํ˜•ํƒœ๋กœ ๋ฐ˜ํ™˜ํ•œ๋‹ค.
  • ํŠน์ •์—ด์„ ๊ณ„์‚ฐํ•  ์ˆ˜๋„ ์žˆ๋‹ค.
  • ๋ฌธ์ž์—ด ๋ฐ์ดํ„ฐ๋Š” ์•„์Šคํ‚ค์ฝ”๋“œ๋กœ ํฌ๊ณ  ์ž‘์Œ์„ ๊ณ„์‚ฐํ•œ๋‹ค.
  • ๋ชจ๋“  ์—ด์˜ ์ตœ์†Œ๊ฐ’: ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„๊ฐ์ฒด.min()
  • ํŠน์ • ์—ด์˜ ์ตœ์†Œ๊ฐ’: ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„๊ฐ์น˜["์—ด๊ฐ’"].min()

ํ‘œ์ค€ํŽธ์ฐจ

  • ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์— std() ๋ฉ”์„œ๋“œ๋ฅผ ์ ์šฉํ•˜๋ฉด ์‚ฐ์ˆ  ๋ฐ์ดํ„ฐ๋ฅผ ๊ฐ–๋Š” ์—ด์˜ ํ‘œ์ค€ํŽธ์ฐจ๋ฅผ ๊ณ„์‚ฐํ•˜์—ฌ ์‹œ๋ฆฌ์ฆˆ๋กœ ๋ฐ˜ํ™˜ํ•œ๋‹ค.
  • ํŠน์ •์—ด๋งŒ ๊ณ„์‚ฐํ•  ์ˆ˜๋„ ์žˆ๋‹ค.
  • ๋ฌธ์ž์—ด ๋ฐ์ดํ„ฐ๋Š” ๋น„๊ตํ•˜์ง€ ์•Š๋Š”๋‹ค.
  • df.std()
  • df["์—ด์ด๋ฆ„"].std()

์ƒ๊ด€๊ณ„์ˆ˜

  • ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์— corr()๋ฉ”์„œ๋“œ๋ฅผ ์ ์šฉํ•˜๋ฉด ๋‘ ์—ด๊ฐ„์˜ ์ƒ๊ด€๊ณ„์ˆ˜๋ฅผ ๊ณ„์‚ฐํ•œ๋‹ค.
  • ์‚ฐ์ˆ  ๋ฐ์ดํ„ฐ๋ฅผ ๊ฐ–๋Š” ๋ชจ๋“  ์—ด์— ๋Œ€ํ•ด 2๊ฐœ์”ฉ ์„œ๋กœ ์ง์„ ์ง“๊ณ , ๊ฐ๊ฐ์˜ ๊ฒฝ์šฐ์— ๋Œ€ํ•˜์—ฌ ์ƒ๊ด€๊ณ„์ˆ˜๋ฅผ ๊ณ„์‚ฐํ•œ๋‹ค.
  • ๋ฌธ์ž์—ด ๋ฐ์ดํ„ฐ๋Š” ๊ณ„์‚ฐ์ด ๋ถˆ๊ฐ€๋Šฅํ•˜๊ธฐ๋•Œ๋ฌธ์— ํฌํ•จํ•˜์ง€ ์•Š๋Š”๋‹ค.
  • df.corr()
  • df[["열이름", "열이름"]].corr()
# Restrict the correlation to numeric columns explicitly. Since pandas 2.0,
# numeric_only defaults to False, so the string column "C" would make a bare
# df.corr() raise instead of being skipped. (numeric_only needs pandas >= 1.5.)
df.corr(numeric_only=True)
A B
A 1.000000 -0.372822
B -0.372822 1.000000
df[["A","B"]].corr()
A B
A 1.000000 -0.372822
B -0.372822 1.000000

ํŒ๋‹ค์Šค ๋‚ด์žฅ ๊ทธ๋ž˜ํ”„ ๋„๊ตฌ ํ™œ์šฉ

  • ์‹œ๋ฆฌ์ฆˆ ๋˜๋Š” ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ ๊ฐ์ฒด์— plot()๋ฉ”์„œ๋“œ๋ฅผ ์ ์šฉํ•˜์—ฌ ๊ทธ๋ž˜ํ”„๋ฅผ ๊ทธ๋ฆฐ ๋’ค, kind ์˜ต์…˜์œผ๋กœ ์ข…๋ฅ˜๋ฅผ ์„ ํƒํ•œ๋‹ค.
kind ์˜ต์…˜ ์„ค๋ช… kind ์˜ต์…˜ ์„ค๋ช…
'line' ์„  ๊ทธ๋ž˜ํ”„ 'kde' ์ปค๋„ ๋ฐ€๋„ ๊ทธ๋ž˜ํ”„
'bar' ์ˆ˜์ง ๋ง‰๋Œ€ ๊ทธ๋ž˜ํ”„ 'area' ๋ฉด์  ๊ทธ๋ž˜ํ”„
'barh' ์ˆ˜ํ‰ ๋ง‰๋Œ€ ๊ทธ๋ž˜ํ”„ 'pie' ํŒŒ์ด ๊ทธ๋ž˜ํ”„
'hist' 히스토그램 'scatter' 산점도 그래프
'box' '๋ฐ•์Šค ๊ทธ๋ž˜ํ”„' 'hexbin' ๊ณ ๋ฐ€๋„ ์‚ฐ์ ๋„ ๊ทธ๋ž˜ํ”„

์„  ๊ทธ๋ž˜ํ”„

  • ๋ฐ์ดํ„ฐํ”„๋ž˜์ž„ ๊ฐ์ฒด์— plot()๋ฉ”์„œ๋“œ๋ฅผ ์ ์šฉํ•  ๋•Œ ๋‹ค๋ฅธ ์˜ต์…˜์„ ์ถ”๊ฐ€ํ•˜์ง€ ์•Š์œผ๋ฉด ๊ฐ€์žฅ ๊ธฐ๋ณธ์ ์ธ ์„  ๊ทธ๋ž˜ํ”„๋ฅผ ๊ทธ๋ฆฐ๋‹ค.
  • ์„  ๊ทธ๋ž˜ํ”„: ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„๊ฐ์ฒด.plot()

๋ง‰๋Œ€ ๊ทธ๋ž˜ํ”„

  • plot()메서드 안에 kind="bar"옵션을 추가한다.
  • ๋ง‰๋Œ€๊ทธ๋ž˜ํ”„ : ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„๊ฐ์ฒด.plot(kind="bar")

ํžˆ์Šคํ† ๊ทธ๋žจ

  • plot()๋ฉ”์„œ๋“œ ์•ˆ์— kind="hist"์˜ต์…˜์„ ์ถ”๊ฐ€ํ•œ๋‹ค
  • ํžˆ์Šคํ† ๊ทธ๋žจ: ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„๊ฐ์ฒด.plot(kind="hist")
df.plot(kind="hist")
<matplotlib.axes._subplots.AxesSubplot at 0x11910b7d0>

png

์‚ฐ์ ๋„

  • df.plot(x="ํŠน์ • ์—ด", y="ํŠน์ • ์—ด", kind="scatter")
df.plot(x='A', y='B', kind="scatter")
<matplotlib.axes._subplots.AxesSubplot at 0x11a2fb250>

png

๋ฐ•์Šคํ”Œ๋กฏ

  • plot()๋ฉ”์„œ๋“œ ์•ˆ์— kind="box"์˜ต์…˜์„ ์ž…๋ ฅํ•œ๋‹ค.
  • ๋ฐ•์Šคํ”Œ๋กฏ์„ ํ†ตํ•ด 'o'ํ‘œ์‹œ๋ฅผ ํ†ตํ•ด ์ด์ƒ๊ฐ’์„ ํ™•์ธํ•  ์ˆ˜ ์žˆ๋‹ค.
  • 박스플롯: 데이터프레임[['열 이름', '열 이름', ...]].plot(kind="box")
  • ์ด๋•Œ ํ•˜๋‚˜๋งŒ ๋ณด๊ณ  ์‹ถ๋‹ค๋ฉด ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์— ์ปฌ๋Ÿผ์„ ํ•˜๋‚˜๋งŒ ์ง€์ •ํ•˜๋ฉด ๋œ๋‹ค.

์‹œ๊ฐํ™” ๋„๊ตฌ

Matplotlib - ๊ธฐ๋ณธ ๊ทธ๋ž˜ํ”„ ๋„๊ตฌ

์„  ๊ทธ๋ž˜ํ”„

- ์„  ๊ทธ๋ž˜ํ”„๋Š” ์—ฐ์†ํ•˜๋Š” ๋ฐ์ดํ„ฐ ๊ฐ’๋“ค์„ ์ง์„  ๋˜๋Š” ๊ณก์„ ์œผ๋กœ ์—ฐ๊ฒฐํ•˜์—ฌ ๋ฐ์ดํ„ฐ ๊ฐ’ ์‚ฌ์ด์˜ ๊ด€๊ณ„๋ฅผ ๋‚˜ํƒ€๋‚ธ๋‹ค
- ํŠนํžˆ ์‹œ๊ณ„์—ด ๋ฐ์ดํ„ฐ์™€ ๊ฐ™์ด ์—ฐ์†์ ์ธ ๊ฐ’์˜ ๋ณ€ํ™”์™€ ํŒจํ„ด์„ ํŒŒ์•…ํ•˜๋Š”๋ฐ ์ ํ•ฉํ•˜๋‹ค.
- ์„  ๊ทธ๋ž˜ํ”„๋Š” ๊ธฐ๋ณธ ์˜ต์…˜์ด๊ธฐ ๋•Œ๋ฌธ์— ์˜ต์…˜์„ ์„ค์ •ํ•˜์ง€ ์•Š๊ณ  plot ํ•จ์ˆ˜๋ฅผ ์“ฐ๋ฉด ์„  ๊ทธ๋ž˜ํ”„๊ฐ€ ๋‚˜์˜จ๋‹ค.

1. ์Šคํƒ€์ผ ์„œ์‹ ์ง€์ •
plt.style.use('ggplot')

2. ๊ทธ๋ฆผ ์‚ฌ์ด์ฆˆ ์ง€์ •
plt.figure(figsize=(๊ฐ€๋กœ์ˆซ์ž,์„ธ๋กœ์ˆซ์ž))

3. x์ถ•, y์ถ• ๋ฐ์ดํ„ฐ๋ฅผ plot ํ•จ์ˆ˜์— ์ž…๋ ฅ --> ์‹œ๋ฆฌ์ฆˆ์˜ ์ธ๋ฑ์Šค๋ฅผ x์ถ• ๋ฐ์ดํ„ฐ๋กœ, ๋ฐ์ดํ„ฐ๊ฐ’์„ y์ถ•๋ฐ์ดํ„ฐ๋กœ ์ „๋‹ฌ
plt.plot(x์ถ•, y์ถ•, marker="์ง€์ • ์•ŒํŒŒ๋ฒณ", markersize=์ˆซ์ž์ž…๋ ฅ)
#ํŒ๋‹ค์Šค ๊ฐ์ฒด ์ž์ฒด๋ฅผ plotํ•จ์ˆ˜์— ์ž…๋ ฅํ•˜๋Š” ๊ฒƒ๋„ ๊ฐ€๋Šฅํ•˜๋‹ค. 

4. ๊ทธ๋ž˜ํ”„ ๊ฐ์ฒด์— ์ฐจํŠธ ์ œ๋ชฉ์„ ์ถ”๊ฐ€ํ•  ๋•Œ title()ํ•จ์ˆ˜๋ฅผ ์ด์šฉํ•œ๋‹ค.
plt.title("์ œ๋ชฉ ์ž…๋ ฅ", size=์ˆซ์ž) 

5. ์ถ• ์ด๋ฆ„ ์ถ”๊ฐ€ 
plt.xlabel("x์ถ• ์ด๋ฆ„", size=์ˆซ์ž)
plt.ylabel("y์ถ• ์ด๋ฆ„", size=์ˆซ์ž)

6. matplotlib ํ•œ๊ธ€ ๋ฌธ์ œ ํ•ด๊ฒฐ --> ํ•ด๋‹น ์ฝ”๋“œ๋ฅผ ๊ทธ๋Œ€๋กœ ์“ฐ๋ฉด ๋œ๋‹ค.
from matplotlib import font_manager, rc 
font_path = "./ํฐํŠธํŒŒ์ผ์œ„์น˜/ํฐํŠธ.ttf"
font_name = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font_name)

7. x์ถ•, y์ถ• ๋ฒ”์œ„ ์ง€์ • (์ตœ์†Ÿ๊ฐ’, ์ตœ๋Œ“๊ฐ’)
plt.xlim(์ตœ์†Ÿ๊ฐ’, ์ตœ๋Œ“๊ฐ’)
plt.ylim(์ตœ์†Ÿ๊ฐ’, ์ตœ๋Œ“๊ฐ’) 

8. x์ถ• ๋ˆˆ๊ธˆ ๋ผ๋ฒจ ํšŒ์ „ / vertical ๋Œ€์‹  ์ˆซ์ž๋ฅผ ์ž…๋ ฅํ•ด๋„ ๋œ๋‹ค.
plt.xticks(rotation="vertical")

9. ๋ฒ”๋ก€ํ‘œ์‹œ / best๋Š” ์ตœ์ ์˜ ์œ„์น˜๋ฅผ ์„ ์ •ํ•ด์ฃผ๋Š” ์ž๋™๊ธฐ๋Šฅ์ด๋‹ค. 
plt.legend(labels=["๋ฒ”๋ก€ ์ด๋ฆ„"], loc="best", fontsize=์ˆซ์ž)

10. ๊ทธ๋ž˜ํ”„ ์ถœ๋ ฅ
plt.show()  

๋ฐ์ดํ„ฐ ์‚ฌ์ „ ์ฒ˜๋ฆฌ

๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ

- ์œ ํšจํ•œ ๋ฐ์ดํ„ฐ ๊ฐ’์ด ์กด์žฌํ•˜์ง€ ์•Š๋Š” ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ๋ฅผ NaN์œผ๋กœ ํ‘œ์‹œํ•œ๋‹ค. 
- ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ ํ™•์ธ
    - df.info()๋ฉ”์„œ๋“œ๋กœ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ ์š”์•ฝ์ •๋ณด๋ฅผ ์ถœ๋ ฅํ•˜๋ฉด NaN ๊ฐ’์˜ ๊ฐœ์ˆ˜๋ฅผ ๋ณด์—ฌ์ค€๋‹ค. 
    - df["์—ด"].value_counts() ๋ฉ”์„œ๋“œ๋กœ ํŠน์ • ์—ด์˜ ๋ˆ„๋ฝ ๋ฐ์ดํ„ธ๋ฅด ํ™•์ธํ•  ์ˆ˜ ์žˆ๋‹ค. 
        - ์ด๋•Œ ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ์˜ ๊ฐœ์ˆ˜๋ฅผ ํ™•์ธํ•˜๋ ค๋ฉด ๋ฐ˜๋“œ์‹œ dropna = False๋ผ๋Š” ์˜ต์…˜์„ ์ ์šฉํ•ด์•ผ ํ•œ๋‹ค. ๊ทธ๋ ‡์ง€ ์•Š์œผ๋ฉด NaN๊ฐ’์„ ์ œ์™ธํ•˜๊ณ  ์œ ์š”ํ•œ ๋ฐ
          ์ดํ„ฐ ๊ฐœ์ˆ˜๋งŒ์„ ๊ตฌํ•œ๋‹ค.
    - df.isnull()๋ฉ”์„œ๋“œ์™€ notnull()๋ฉ”์„œ๋“œ๋ฅผ ํ†ตํ•ด ์ง์ ‘์ ์œผ๋กœ ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ๋ฅผ ์ฐพ์„ ์ˆ˜ ์žˆ๋‹ค. 
        - isnull(): ๋ˆ„๋ฝ๋ฐ์ดํ„ฐ๋ผ๋ฉด True ๋ฐ˜ํ™˜, ์œ ํšจํ•œ ๋ฐ์ดํ„ฐ๋ฉด False๋ฅผ ๋ฐ˜ํ™˜
        - notnull(): ์œ ํšจํ•œ ๋ฐ์ดํ„ฐ๋ฉด True ๋ฐ˜ํ™˜, ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ๋ฉด False๋ฅผ ๋ฐ˜ํ™˜
        - ์ด๋•Œ True๋Š” 1๋กœ ๊ณ„์‚ฐ๋˜๊ณ  False๋Š” 0์œผ๋กœ ํŒ๋ณ„๋˜๊ธฐ ๋•Œ๋ฌธ์—, sum(axis=0) ๋ฉ”์†Œ๋“œ๋ฅผ ํ™œ์šฉํ•˜์—ฌ True(1)์˜ ํ•ฉ์„ ๊ตฌํ•  ์ˆ˜ ์žˆ๋‹ค. 
            - print(df.head().isnull().sum(axis=0))

๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ ์ œ๊ฑฐ

- dropna() ๋ฉ”์„œ๋“œ๋ฅผ ์ ์šฉํ•ด NaN ๊ฐ’์„ ๊ฐ–๋Š” ํ–‰ ๋˜๋Š” ์—ด์„ ์‚ญ์ œํ•  ์ˆ˜ ์žˆ๋‹ค. 
- subset์˜ต์…˜์œผ๋กœ ์—ด์„ ํ•œ์ •ํ•  ์ˆ˜ ์žˆ๋‹ค.
- how ์˜ต์…˜์œผ๋กœ NaN๊ฐ’์— ๋”ฐ๋ผ ์‚ญ์ œ ์กฐ๊ฑด์„ ์ค„ ์ˆ˜ ์žˆ๋‹ค.
- ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ ์‚ญ์ œ(๊ธฐ์ค€์ ): ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„๊ฐ์ฒด.dropna(axis=0 ๋˜๋Š” 1, thresh=n)
- ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ ์‚ญ์ œ(์กฐ๊ฑด): ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„๊ฐ์ฒด.dropna(subset=['์—ด ์ด๋ฆ„'], how='any' or 'all', axis =0 ๋˜๋Š” 1)
์ด๋•Œ n์€ ๊ฒฐ์ธก์น˜๊ฐ€ n๊ฐœ ์ด์ƒ์ธ ํ–‰ ๋˜๋Š” ์—ด์„ ์‚ญ์ œํ•˜๋ผ๋Š” ๊ธฐ์ค€์ ์„ ์ฃผ๋Š” ๊ฒƒ์ด๋‹ค.
how ์˜ต์…˜์˜ any๋Š” NaN๊ฐ€ ํ•˜๋‚˜๋ผ๋„ ์กด์žฌํ•˜๋Š” ํ–‰ ๋˜๋Š” ์—ด์„ ์‚ญ์ œํ•˜๋ผ๋Š” ์˜๋ฏธ๊ณ , all์€ ๋ชจ๋“  ๋ฐ์ดํ„ฐ๊ฐ€ NaN์ธ ๊ฒฝ์šฐ์—๋งŒ ์‚ญ์ œํ•˜๋ผ๋Š” ์˜๋ฏธ๋‹ค. 
import seaborn as sns

# Load seaborn's Titanic example dataset.
df = sns.load_dataset('titanic')

# Count the NaN values in each column. isnull() marks missing cells as True and
# summing a boolean column counts its True entries, so a column without any NaN
# reports 0 directly -- no bare `except:` around a missing True key is needed.
missing_df = df.isnull()
for col, missing_count in missing_df.sum().items():
    print(col, ":", missing_count)
survived : 0
pclass : 0
sex : 0
age : 177
sibsp : 0
parch : 0
fare : 0
embarked : 2
class : 0
who : 0
adult_male : 0
deck : 688
embark_town : 2
alive : 0
alone : 0
df_thres = df.dropna(axis=1, thresh=500)
print(df_thres.columns)
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')

891๋ช…์˜ ์Šน๊ฐ์ค‘์—์„œ 177๋ช…์˜ ๋‚˜์ด์— ๋Œ€ํ•œ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†๋‹ค. ์Šน๊ฐ์˜ ๋‚˜์ด๊ฐ€ ๋ฐ์ดํ„ฐ ๋ถ„์„์˜ ์ค‘์š”ํ•œ ๋ณ€์ˆ˜๋ผ๋ฉด, ๋‚˜์ด ๋ฐ์ดํ„ฐ๊ฐ€ ์—†๋Š” ์Šน๊ฐ์˜ ๋ ˆ์ฝ”๋“œ(ํ–‰)๋ฅผ ์ œ๊ฑฐํ•˜๋Š” ๊ฒƒ์ด ์ข‹๋‹ค. dropna() ๋ฉ”์†Œ๋“œ์— subset์„ 'age'์—ด๋กœ ํ•œ์ •ํ•˜๋ฉด 'age'์—ด์˜ ํ–‰ ์ค‘์—์„œ NaN๊ฐ’์ด ์žˆ๋Š” ๋ชจ๋“  ํ–‰(axis=0)์„ ์‚ญ์ œํ•œ๋‹ค. ๊ธฐ๋ณธ๊ฐ’์œผ๋กœ how="any" ์˜ต์…˜์ด ์ ์šฉ๋˜๋Š”๋ฐ, NaN๊ฐ’์ด ํ•˜๋‚˜๋ผ๋„ ์กด์žฌํ•˜๋ฉด ์‚ญ์ œํ•œ๋‹ค๋Š” ๋œป์ด๋‹ค. how="all" ์˜ต์…˜์œผ๋กœ ์ž…๋ ฅํ•˜๋ฉด ๋ชจ๋“  ๋ฐ์ดํ„ฐ๊ฐ€ NaN๊ฐ’์ผ ๊ฒฝ์šฐ์—๋งŒ ์‚ญ์ œ๊ฐ€ ๋œ๋‹ค. ์˜ˆ์ œ์—์„œ๋Š” 891๊ฐœ์˜ ํ–‰ ์ค‘์—์„œ ๋‚˜์ด ๋ฐ์ดํ„ฐ๊ฐ€ ๋ˆ„๋ฝ๋œ 177๊ฐœ ํ–‰์„ ์‚ญ์ œํ•˜๊ณ  ๋‚˜๋จธ์ง€ 714๊ฐœ์˜ ํ–‰์„ df_age์— ์ €์žฅํ•œ๋‹ค

# Drop every row whose 'age' value is NaN; the other columns are not inspected.
df_age = df.dropna(subset=["age"], how="any", axis=0)

# Re-count the NaN values per column on the filtered frame. Summing the boolean
# mask yields 0 for complete columns, so no bare `except:` fallback is needed.
missing_data = df_age.isnull()
for col, eda in missing_data.sum().items():
    print(col, ":", eda)
survived : 0
pclass : 0
sex : 0
age : 0
sibsp : 0
parch : 0
fare : 0
embarked : 2
class : 0
who : 0
adult_male : 0
deck : 530
embark_town : 2
alive : 0
alone : 0

๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ ์น˜ํ™˜

- ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ๋ฅผ ๋ฐ”๊ฟ”์„œ ๋Œ€์ฒดํ•  ๊ฐ’์œผ๋กœ๋Š” ๋ฐ์ดํ„ฐ์˜ ๋ถ„ํฌ์™€ ํŠน์„ฑ์„ ์ž˜ ๋‚˜ํƒ€๋‚ผ ์ˆ˜ ์žˆ๋Š” ํ‰๊ท ๊ฐ’, ์ตœ๋นˆ๊ฐ’ ๋“ฑ์„ ํ™œ์šฉํ•œ๋‹ค. 
- ํŒ๋‹ค์Šค์—์„œ๋Š” fillna() ๋ฉ”์†Œ๋“œ๋ฅผ ์ƒใ…‡ํ•œ๋‹ค.
- ์›๋ณธ ๊ฐ์ฒด๋ฅผ ๋ณ€๊ฒฝํ•˜๋ ค๋ฉด inplace = True์˜ต์…˜์„ ์ถ”๊ฐ€ํ•ด์•ผ ํ•œ๋‹ค. 
- ํ‰๊ท (Mean)์œผ๋กœ ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ๋ฅผ ๋ฐ”๊ฟ”์ฃผ๋Š” ๋ฐฉ๋ฒ•์„ ์•Œ์•„๋ณด์ž. ์•ž์˜ ์˜ˆ์ œ์ฒ˜๋Ÿผ ์Šน๊ฐ์˜ ๋‚˜์ด ๋ฐ์ดํ„ฐ๊ฐ€ ๋ˆ„๋ฝ๋œ ํ–‰์„ ์ œ๊ฑฐํ•˜์ง€ ์•Š๊ณ , ๋Œ€์‹  'age'์—ด์˜ ๋‚˜๋จธ
์ง€ ์Šน๊ฐ์˜ ํ‰๊ท  ๋‚˜์ด๋กœ ์น˜ํ™˜ํ•˜๋‹ค. ๋จผ์ € 'age'์—ด์— ๋“ค์–ด์žˆ๋Š” ๊ฐ’๋“ค์˜ ํ‰๊ท ์„ ๊ณ„์‚ฐํ•˜์—ฌ mean_age์— ์ €์žฅํ•œ๋‹ค. mean() ๋ฉ”์†Œ๋“œ๋ฅผ ์ ์šฉํ•˜๋ฉด NaN์„ ์ œ์™ธ
ํ•˜๊ณ  ํ‰๊ท ์„ ๊ณ„์‚ฐํ•œ๋‹ค. fillna() ๋ฉ”์†Œ๋“œ์— mean_age๋ฅผ ์ธ์ž๋กœ ์ „๋‹ฌํ•˜๋ฉด NaN์„ ์ฐพ์•„์„œ mean_age๊ฐ’์œผ๋กœ ์น˜ํ™˜ํ•œ๋‹ค. 

์ •์ˆ˜ํ˜• ๋ฐ์ดํ„ฐ

import seaborn as sns

df = sns.load_dataset('titanic')
print(df['age'].head(10))
print('\n')

# mean() skips NaN by default, so this is the average of the known ages.
mean_age = df['age'].mean()
# Assign the filled column back instead of df["age"].fillna(..., inplace=True):
# an in-place call on a selected column is chained assignment, which pandas 2.x
# warns about and which does not update df under Copy-on-Write.
df["age"] = df["age"].fillna(mean_age)

# Print the first 10 'age' values (the NaN in row 5 is replaced by the mean).
print(df['age'].head(10))

๋ฒ”์ฃผํ™” ๋ฐ์ดํ„ฐ

์Šน์„ ๋„์‹œ๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” 'embarktown'์—ด์— ์žˆ๋Š” NaN์„ ๋‹ค๋ฅธ ๊ฐ’์œผ๋กœ ๋ฐ”๊พผ๋‹ค. ์Šน๊ฐ๋“ค์ด ๊ฐ€์žฅ ๋งŽ์ด ์Šน์„ ํ•œ ๋„์‹œ์˜ ์ด๋ฆ„์„ ์ฐพ์•„์„œ NaN์„ ์น˜ํ™˜ํ•œ๋‹ค. ๋จผ์ € valuecounts() ๋ฉ”์†Œ๋“œ๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ์Šน์„ ๋„์‹œ๋ณ„ ์Šน๊ฐ ์ˆ˜๋ฅผ ์ฐพ๊ณ , idxmax() ๋ฉ”์†Œ๋“œ๋กœ ๊ฐ€์žฅ ํฐ ๊ฐ’์„ ๊ฐ–๋Š” ๋„์‹œ๋ฅผ ์ฐพ๋Š”๋‹ค. ์‹คํ–‰ ๊ฒฐ๊ณผ์—์„œ 829ํ–‰์˜ NaN๊ฐ’์„ ํฌํ•จํ•ด์„œ ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ๋“ค์€ Southamption์œผ๋กœ ๋ณ€๊ฒฝํ•œ๋‹ค

import seaborn as sns

df = sns.load_dataset('titanic')

# Show rows 825-829 of 'embark_town'; row 829 holds a NaN.
print(df['embark_town'][825:830])
print('\n')

# Find the most frequent embarkation town: value_counts() tallies each town
# (NaN excluded) and idxmax() returns the label with the largest count.
most_freq = df["embark_town"].value_counts(dropna=True).idxmax()
print(most_freq)
print('\n')

# Assign the filled column back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place call is deprecated in pandas 2.x and
# leaves df untouched under Copy-on-Write.
df['embark_town'] = df['embark_town'].fillna(most_freq)

# Same rows again: the NaN in row 829 is now replaced by most_freq.
print(df['embark_town'][825:830])
825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829            NaN
Name: embark_town, dtype: object


Southampton


825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829    Southampton
Name: embark_town, dtype: object

๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ๊ฐ€ ํ‘œ์‹œ๊ฐ€ ์•ˆ๋ ๋•Œ

๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ๊ฐ€ NaN์œผ๋กœ ํ‘œ์‹œ๋˜์ง€ ์•Š์€ ๊ฒฝ์šฐ - ๋ฐ์ดํ„ฐ์…‹ ์ค‘์—๋Š” ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ๊ฐ€ NaN์œผ๋กœ ์ž…๋ ฅ๋˜์ง€ ์•Š์€ ๊ฒฝ์šฐ๋„ ๋งŽ๋‹ค. ์˜ˆ๋ฅผ ๋“ค๋ฉด, ์ˆซ์ž 0์ด๋‚˜ ๋ฌธ์ž '-','?' ๊ฐ™์€ ๊ฐ’์œผ๋กœ ์ž…๋ ฅ๋˜๊ธฐ๋„ ํ•œ๋‹ค. ํŒ๋‹ค์Šค์—์„œ ๋ˆ„๋ฝ ๋ฐ์ดํ„ฐ๋ฅผ ๋‹ค๋ฃจ๋ ค๋ฉด replace() ๋ฉ”์†Œ๋“œ๋ฅผ ํ™œ์šฉํ•˜์—ฌ NumPy์—์„œ ์ง€์›ํ•˜๋Š” np.nan์œผ๋กœ ๋ณ€๊ฒฝํ•ด์ฃผ๋Š” ๊ฒƒ์ด ์ข‹๋‹ค. ๋‹จ, np.nan์„ ์‚ฌ์šฉํ•˜๊ธฐ ์œ„ํ•ด์„œ๋Š” "import numpy as np"์™€ ๊ฐ™์ด NumPy๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋ฅผ ๋จผ์ € ์ž„ํฌํŠธํ•ด์•ผ ํ•œ๋‹ค. - ์‚ฌ์šฉ๋ฒ•: df.replace('?', np.nan, inplace=True) - nan์™ธ์˜ ๋‹ค๋ฅธ ๋ˆ„๋ฝ๊ฐ’์ด ์žˆ๋‚˜ ํ™•์ธ๋ฒ•: unicon.apply(lambda x: "?" in list(x), axis=1 )

๋ฒ”์ฃผํ™” ๋ฐ์ดํ„ฐ ํŠน์„ฑ์„ ์ด์šฉํ•œ ๊ฒฐ์ธก์น˜ ์น˜ํ™˜

๋ฐ์ดํ„ฐ์…‹์˜ ํŠน์„ฑ์ƒ ์„œ๋กœ ์ด์›ƒํ•˜๊ณ  ์žˆ๋Š” ๋ฐ์ดํ„ฐ๋ผ๋ฆฌ ์œ ์‚ฌ์„ฑ์„ ๊ฐ€์งˆ ๊ฐ€๋Šฅ์„ฑ์ด ๋†’๋‹ค. ์ด๋Ÿด ๋•Œ๋Š” ์•ž์ด๋‚˜ ๋’ค์—์„œ ์ด์›ƒํ•˜๊ณ  ์žˆ๋Š” ๊ฐ’์œผ๋กœ ์น˜ํ™˜ํ•ด ์ฃผ๋Š” ๊ฒƒ์ด ์ข‹๋‹ค. fillna() ๋ฉ”์†Œ๋“œ์— method='ffill' ์˜ต์…˜์„ ์ถ”๊ฐ€ํ•˜๋ฉด NaN์ด ์žˆ๋Š” ํ–‰์˜ ์ง์ „ ํ–‰์— ์žˆ๋Š” ๊ฐ’์œผ๋กœ ๋ฐ”๊ฟ”์ค€๋‹ค. method='bfill' ์˜ต์…˜์„ ์‚ฌ์šฉํ•˜๋ฉด NaN์ด ์žˆ๋Š” ํ–‰์˜ ๋ฐ”๋กœ ๋‹ค์Œ ํ–‰์— ์žˆ๋Š” ๊ฐ’์„ ๊ฐ€์ง€๊ณ  ์น˜ํ™˜ํ•œ๋‹ค. ๋‹ค์Œ์˜ ์˜ˆ์ œ์—์„œ๋Š” 'ffill'์˜ต์…˜์„ ์‚ฌ์šฉํ•˜์—ฌ 829ํ–‰์˜ NaN๊ฐ’์„ ๋ฐ”๋กœ ์•ž์— ์œ„์น˜ํ•œ 828ํ–‰์˜ Queenstown์œผ๋กœ ๋ณ€๊ฒฝํ•œ๋‹ค.

import seaborn as sns

df = sns.load_dataset('titanic')
# Forward-fill: each NaN takes the value of the closest preceding row.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated since pandas 2.1; assigning the result back avoids the chained
# in-place call that does not update df under Copy-on-Write.
df['embark_town'] = df['embark_town'].ffill()
df['embark_town'].value_counts(dropna=True)
Southampton    644
Cherbourg      169
Queenstown      78
Name: embark_town, dtype: int64

์ค‘๋ณต ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ

์ค‘๋ณต ๋ฐ์ดํ„ฐ ํ™•์ธ

- ๋™์ผํ•œ ๊ด€์ธก๊ฐ’์ด ์ค‘๋ณต๋˜๋Š”์ง€ ์—ฌ๋ถ€, ์ฆ‰ ํ–‰์˜ ๋ ˆ์ฝ”๋“œ๊ฐ€ ์ค‘๋ณต๋˜๋Š”์ง€ ์—ฌ๋ถ€๋ฅผ ํ™•์ธํ•˜๋ ค๋ฉด duplicated()๋ฉ”์†Œ๋“œ๋ฅผ ์ด์šฉํ•œ๋‹ค. ์ „์— ๋‚˜์˜จ ํ–‰๋“ค๊ณผ ๋น„๊ตํ•˜์—ฌ 
์ค‘๋ณต๋˜๋Š” ํ–‰์ด๋ฉด True๋ฅผ ๋ฐ˜ํ™˜ํ•˜๊ณ , ์ฒ˜์Œ ๋‚˜์˜ค๋Š” ํ–‰์— ๋Œ€ํ•ด์„œ๋Š” False๋ฅผ ๋ฐ˜ํ™˜ํ•œ๋‹ค.
- ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์— duplicated() ๋ฉ”์†Œ๋“œ๋ฅผ ์ ์šฉํ•˜๋ฉด ๊ฐ ํ–‰์˜ ์ค‘๋ณต ์—ฌ๋ถ€๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” ๋ถˆ๋ฆฐ ์‹œ๋ฆฌ์ฆˆ๋ฅผ ๋ฐ˜ํ™˜ํ•œ๋‹ค. 
- 0ํ–‰์˜ ๋ฐ์ดํ„ฐ๋Š” ๋’ค์— ๋‚˜์˜ค๋Š” 1ํ–‰์˜ ๋ฐ์ดํ„ฐ์™€ ๊ฐ™์ง€๋งŒ ์ฒ˜์Œ ๋‚˜์˜ค๋Š” ๊ฐ’์ด๋‹ค. ์ฆ‰, ์•ž์— ๋น„๊ตํ•  ๋ฐ์ดํ„ฐ๊ฐ€ ์•„์˜ˆ์—†๊ธฐ ๋•Œ๋ฌธ์— ์ค‘๋ณต์ด ์•„๋‹ˆ๋ผ๋Š” ๋œป์—์„œ False
๋กœ ํŒ์ •ํ•œ๋‹ค. 1ํ–‰์˜ ๋ฐ์ดํ„ฐ๋Š” ์•ž์˜ 0ํ–‰๊ณผ ์ค‘๋ณต๋˜๊ธฐ ๋•Œ๋ฌธ์— True๊ฐ€ ๋œ๋‹ค.
import pandas as pd

# Small sample frame in which row 1 is an exact copy of row 0.
df = pd.DataFrame(
    data=dict(
        c1=list('aabab'),
        c2=[1, 1, 1, 2, 2],
        c3=[1, 1, 2, 2, 2],
    )
)
print(df)
print('\n')

# Boolean Series: True where a row repeats an earlier row, False for the first
# occurrence of each distinct row.
df_dup = df.duplicated(keep='first')
print(df_dup)
print('\n')
  c1  c2  c3
0  a   1   1
1  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2


0    False
1     True
2    False
3    False
4    False
dtype: bool

๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์˜ ์—ด์€ ์‹œ๋ฆฌ์ฆˆ ๊ฐ์ฒด์ด๋ฏ€๋กœ, duplicated() ๋ฉ”์†Œ๋“œ๋ฅผ ์ ์šฉํ•  ์ˆ˜ ์žˆ๋‹ค. ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ dfdml 'c2'์—ด์€ ์ •์ˆ˜ 1๊ณผ 2๋กœ ๊ตฌ์„ฑ๋œ๋‹ค. 1์ด ์ฒ˜์Œ ๋‚˜ํƒ€๋‚œ 0ํ–‰๊ณผ 2๊ฐ€ ์ฒ˜์Œ ๋‚˜ํƒ€๋‚œ 3ํ–‰์„ ์ œ์™ธํ•˜๊ณ  ๋‚˜๋จธ์ง€ 1,2,4ํ–‰์€ ์ด์ „์— ๋‚˜์˜จ ํ–‰๊ณผ ์ค‘๋ณต๋˜๋ฏ€๋กœ True๊ฐ€ ๋œ๋‹ค. 1,2ํ–‰์€ ๋ฐ์ดํ„ฐ 1์„ ๊ฐ€์ง„ 0ํ–‰๊ณผ ์ค‘๋ณต๋˜๊ณ , 4ํ–‰์€ ๋ฐ์ดํ„ฐ 2๋ฅผ ๊ฐ€์ง„ 3ํ–‰๊ณผ ์ค‘๋ณต๋œ๋‹ค.

col_dup = df['c2'].duplicated()
print(col_dup)
0    False
1     True
2     True
3    False
4     True
Name: c2, dtype: bool

์ค‘๋ณต ๋ฐ์ดํ„ฐ ์ œ๊ฑฐ

์ค‘๋ณต ๋ฐ์ดํ„ฐ๋ฅผ ์ œ๊ฑฐํ•˜๋Š” ๋ช…๋ น์—๋Š” drop_duplicates() ๋ฉ”์†Œ๋“œ๊ฐ€ ์žˆ๋‹ค. ์ค‘๋ณต๋˜๋Š” ํ–‰์„ ์ œ๊ฑฐํ•˜๊ณ  ๊ณ ์œ ํ•œ ๊ด€์ธก๊ฐ’์„ ๊ฐ€์ง„ ํ–‰๋“ค๋งŒ ๋‚จ๊ธด๋‹ค. ์›๋ณธ ๊ฐ์ฒด๋ฅผ ๋ณ€๊ฒฝํ•˜๋ ค๋ฉด inplace=True ์˜ต์…˜์„ ์ถ”๊ฐ€ํ•œ๋‹ค. ๋‹ค์Œ ์˜ˆ์ œ์—์„œ 1ํ–‰์˜ ๋ฐ์ดํ„ฐ๋Š” ์•ž์— ์ด์›ƒํ•˜๊ณ  ์žˆ๋Š” 0ํ–‰์˜ ๋ฐ์ดํ„ฐ์™€ ์ค‘๋ณต๋˜๋ฏ€๋กœ ์ œ๊ฑฐ๋œ๋‹ค.

import pandas as pd 
df=pd.DataFrame({
    "c1":['a','a','b','a','b'],
    "c2":[1,1,1,2,2],
    "c3":[1,1,2,2,2]  
})
print(df)
print('\n')
  c1  c2  c3
0  a   1   1
1  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2
df2 = df.drop_duplicates()
print(df2)
print('\n')
  c1  c2  c3
0  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2

drop_duplicates() ๋ฉ”์†Œ๋“œ์˜ subset ์˜ต์…˜์— "์—ด ์ด๋ฆ„์˜ ๋ฆฌ์ŠคํŠธ"๋ฅผ ์ „๋‹ฌํ•  ์ˆ˜ ์žˆ๋‹ค. ๋ฐ์ดํ„ฐ์˜ ์ค‘๋ณต์—ฌ๋ถ€๋ฅผ ํŒ๋ณ„ํ•  ๋•Œ, subset ์˜ต์…˜์— ํ•ด๋‹นํ•˜๋Š” ์—ด์„ ๊ธฐ์ค€์œผ๋กœ ํŒ๋‹จํ•œ๋‹ค. ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„ df์˜ 'c2','c3' ์—ด์„ ๊ธฐ์ค€์œผ๋กœ ํŒ๋ณ„ํ•˜๋ฉด 0ํ–‰๊ณผ 1ํ–‰, 3ํ–‰๊ณผ 4ํ–‰์˜ ๋ฐ์ดํ„ฐ๊ฐ€ ๊ฐ๊ฐ ์ค‘๋ณต๋œ๋‹ค.

df3 = df.drop_duplicates(subset=['c2','c3'])
print(df3)
  c1  c2  c3
0  a   1   1
2  b   1   2
3  a   2   2

๋ฐ์ดํ„ฐ ํ‘œ์ค€ํ™”

์ฝ”๋“œ ์‹ค์ œ ์˜ˆ์‹œ

ํŠน์ • ์ปฌ๋Ÿผ ๊ฐฏ์ˆ˜๊ฐ€ ์ผ์ • ์ˆ˜์ค€์˜ ๊ฐฏ์ˆ˜๋ฅผ ๋„˜๊ฒจ์•ผ True๋˜๋Š” ์ฝ”๋“œ

# ๋ฐ์ดํ„ฐ๊ฐ€ ์ผ์ • ๊ฐœ์ˆ˜ ์ด์ƒ ์กด์žฌํ•˜๋Š” ํšŒ์‚ฌ๋งŒ ๋‚จ๊ธด๋‹ค.
#df = df.groupby("ํšŒ์‚ฌID").filter(lambda x : len(x) > 24

ํŠน์ • ์ปฌ๋Ÿผ์˜ ์กฐ๊ฑด์ด True์ธ ์ „์ฒด ๋ฐ์ดํ„ฐ ์กฐํšŒ

#df = df.loc[df["์—ฐ๋งค์ถœ์•ก"]>70000000]
#df = df[df["์—ฐ๋งค์ถœ์•ก"]>70000000] ๊ฐ™์€ ์ฝ”๋“œ. 
#df = df[df['์›”๋ณ„_์ง์›์ˆ˜']>=400]
#ํŠน์ • ์กฐ๊ฑด์— ๋งž๋Š” ์ „์ฒด ๋ฐ์ดํ„ฐ๋ฅผ ์ถœ๋ ฅํ•˜๊ธฐ ์œ„ํ•ด์„  df[df[์กฐ๊ฑด]]

์ƒˆ๋กœ์šด ์—ด์— ๋ฆฌ์ŠคํŠธ๋กœ ๊ฐ’ ์ถ”๊ฐ€

# new_df["์—ฐ๋งค์ถœ์•ก_๋ณ€ํ™”๋Ÿ‰"] = change_sales_columns #์—ฌ๊ธฐ์„œ change_sales_columns๋Š” ๋ฆฌ์ŠคํŠธ ํ˜•ํƒœ์ž…๋‹ˆ๋‹ค.

unique() 메서드는 고유값을 NumPy 배열(ndarray)로 돌려준다. (리스트가 필요하면 .tolist()를 사용한다.)

#check_id_list = check_df['ํšŒ์‚ฌID'].unique()
#for์„ ๋Œ๋ฆด ์ˆ˜ ์žˆ๋‹ค.

[126814 294387 294337 126521 294367 126538 439986 126674 126516 507086 126664 403351 126983 227414 126606 126831 510329 227415 403359 403434 469458 419998 126802 419945 419977 126772 403462 127065 469473 469677 127090 127060 420046 420008 294530 403470]

# # ์—ฐ๋งค์ถœ์•ก ๋ณ€ํ™”๊ฐ€ ์—†๋Š” ํšŒ์‚ฌ ID ์ œ๊ฑฐ
# # ๊ธฐ์กด ๋ฆฌ์ŠคํŠธ๋ฅผ for๋ฌธ์„ ๋Œ๋ฉด์„œ ์กฐ๊ฑด์— ๋งŒ์กฑํ•œ ๋ฐ์ดํ„ฐ๋งŒ ๋ฆฌ์ŠคํŠธ ํ•„ํ„ฐ๋ง
# # ํ•„ํ„ฐ๋ง๋œ ๋ฆฌ์ŠคํŠธ๋ฅผ ๋‹ค์‹œ for๋ฌธ์„ ๋Œ๋ฉด์„œ ํ•„ํ„ฐ๋ง. 
# change_sales_list = []
# for x in check_id_list :
#     new_df = check_df[check_df['ํšŒ์‚ฌID']==x]
#     if sum(new_df['์—ฐ๋งค์ถœ์•ก_๋ณ€ํ™”๋Ÿ‰']) > 10000000 : # ์—ฐ๋งค์ถœ์•ก ์„ฑ์žฅ 100์–ต ์ด์ƒ
#         change_sales_list.append(x) 
# print(len(change_sales_list), "๊ฐœ์˜ ํšŒ์‚ฌ๊ฐ€ ์—ฐ๋งค์ถœ์•ก์ด ์ฆ๊ฐ€ํ–ˆ์Šต๋‹ˆ๋‹ค.")
# print(change_sales_list)
# # ์ง์›์ˆ˜ ๋ณ€ํ™”๊ฐ€ ์—†๋Š” ํšŒ์‚ฌ ID ์ œ๊ฑฐ

# change_worker_list = []
# for z in change_sales_list :
#     worker_df = check_df[check_df['ํšŒ์‚ฌID']==z]
#     if sum(worker_df['์›”๋ณ„_์ง์›์ˆ˜_๋ณ€ํ™”๋Ÿ‰']) > 30 : # ์ง์›์ˆ˜ 30๋ช… ์ด์ƒ
#         change_worker_list.append(z)   
# print(len(change_worker_list), "๊ฐœ์˜ ํšŒ์‚ฌ์˜ ์ง์›์ˆ˜๊ฐ€ ์ฆ๊ฐ€ํ–ˆ์Šต๋‹ˆ๋‹ค.")  
# print(change_worker_list)