Having data does not always mean you can produce analytics right away. There is often a lot of preprocessing to be done.
This material is about cleaning: making sure each cell has a value that can be used in your upcoming procedures. There are always impurities that keep the computer from recognizing the data correctly, e.g. commas instead of periods (and vice versa), unneeded blanks, irrelevant symbols (dollar or euro signs), or non-standard symbols representing missing values.
I will use two approaches. The first one is the smart use of regular expressions (regex), and the second one is a divide-and-conquer strategy.
Imagine that you request the age of people in an online form. Sometimes you run into answers with issues like these: "It is:24", "It is: 34", "It is54", "64 it is", "I am twenty".
From those examples, you are only interested in the age, nothing else. The first two cases are relatively easy to solve with divide and conquer, since you can see a character that helps:
case1="It is:24"
case2="It is: 34"
# try 1
case1.split(':')[1]
'24'
#try 2:
case2.split(':')[1] # note the leading blank space in the result
' 34'
Split broke the string using ":" and produced a list. The number will be the second element. However, in case2 you got an extra space. You need to think about a general rule, so maybe this is better:
case1.split(':')[1].strip()
'24'
case2.split(':')[1].strip() # strip() only removes spaces at the start and end
'34'
Using strip() gets rid of the spaces around the string. Notice that strip() and split() are functions in base Python. Pandas has its own versions.
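The two-step rule above can be wrapped in a small helper (the function name `extract_after_colon` is mine, just for illustration):

```python
# A minimal sketch: split on ':', take the last piece, strip blanks.
def extract_after_colon(text):
    return text.split(':')[-1].strip()

print(extract_after_colon("It is:24"))   # 24
print(extract_after_colon("It is: 34"))  # 34
```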
You can use divide and conquer as long as every string you find follows the same pattern. Imagine those values make up a column in a data frame:
import pandas as pd
ages=["It is:24","It is: 44","It is54",
"64 it is","I am twenty","The 10th I turn 21",
"I am 15 years old","~20"]
someData=pd.DataFrame({'age':ages})
someData
age | |
---|---|
0 | It is:24 |
1 | It is: 44 |
2 | It is54 |
3 | 64 it is |
4 | I am twenty |
5 | The 10th I turn 21 |
6 | I am 15 years old |
7 | ~20 |
Now, let's use Pandas' own strip and split:
someData.age.str.split(':') # pandas string methods work via .str; this splits on ':', but not every cell has one
0 [It is, 24] 1 [It is, 44] 2 [It is54] 3 [64 it is] 4 [I am twenty] 5 [The 10th I turn 21] 6 [I am 15 years old] 7 [~20] Name: age, dtype: object
Or alternatively:
someData.age.str.split(':',expand=True)
# if every cell had ':', this would produce 2 clean columns, but some elements lack ':'
0 | 1 | |
---|---|---|
0 | It is | 24 |
1 | It is | 44 |
2 | It is54 | None |
3 | 64 it is | None |
4 | I am twenty | None |
5 | The 10th I turn 21 | None |
6 | I am 15 years old | None |
7 | ~20 | None |
Notice the use of expand. It sends each element of the resulting list to its own column. However, as there is no consistent pattern (the ":" symbol moves around or is absent), you do not get a good result. The situation requires the REGEX approach.
Using regular expressions is a great way to go when there is no pattern for the previous strategy; however, it takes time to learn how to build a regular expression that serves your specific needs in a particular situation.
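Before moving to Pandas, the same metacharacters can be tried in base Python with the `re` module (a quick sketch of the patterns used below):

```python
import re

# \d matches one digit, \D matches one non-digit, + means "one or more"
print(re.search(r'\d+', "It is:24").group())     # 24
print(re.search(r'\d', "I am twenty"))           # None (no digit present)
print(re.findall(r'\d+', "The 10th I turn 21"))  # ['10', '21']
```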
In general, you need to explore a few regex patterns before deciding which to use. I recommend using contains() for that:
# does each cell have a character that is not a digit? (\D)
someData.age.str.contains(pat=r'\D', # \D matches anything that is not a digit, so every row is True
                          regex=True) # regex=True: interpret the pattern as a regular expression
0 True 1 True 2 True 3 True 4 True 5 True 6 True 7 True Name: age, dtype: bool
# does each cell have a digit character? (\d)
someData.age.str.contains(pat=r'\d',regex=True) # row 4 is False: it has no digits
0 True 1 True 2 True 3 True 4 False 5 True 6 True 7 True Name: age, dtype: bool
# filter
someData[~someData.age.str.contains(pat=r'\d',regex=True)] # ~ negates the boolean mask
age | |
---|---|
4 | I am twenty |
# is there a cell where you have
# symbols beyond [^ ] alphanumeric (\w) or spaces (\s)?
someData.age[someData.age.str.contains(pat=r'[^\w\s]',regex=True)] # \w matches letters, digits, and underscore
0 It is:24 1 It is: 44 7 ~20 Name: age, dtype: object
# what happens if I erase all non numbers (\D)?
someData.age.str.replace(pat=r'\D',repl='',regex=True) # regex=True so that \D (or any pattern) is read as a regex
0 24 1 44 2 54 3 64 4 5 1021 6 15 7 20 Name: age, dtype: object
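Row 5 above shows the pitfall: deleting every non-digit fuses separate numbers. A quick check in base Python:

```python
import re

# "10" (from "10th") and "21" glue together once the characters
# between them are removed - this is why row 5 above reads 1021
print(re.sub(r'\D', '', "The 10th I turn 21"))  # 1021
```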
# what happens if I extract consecutive numeric characters (\d+) ?
someData.age.str.extract(pat=r'(\d+)',expand=True)
0 | |
---|---|
0 | 24 |
1 | 44 |
2 | 54 |
3 | 64 |
4 | NaN |
5 | 10 |
6 | 15 |
7 | 20 |
# what happens if I erase all
# digits (\d+) followed by letters ([a-z]+)?
someData.age.str.replace(pat=r'\d+[a-z]+',
repl='',
regex=True)
0 It is:24 1 It is: 44 2 It is54 3 64 it is 4 I am twenty 5 The I turn 21 6 I am 15 years old 7 ~20 Name: age, dtype: object
# so:
someData.age.str.replace(pat=r'\d+[a-z]+',
repl='',
regex=True).\
str.extract(pat=r'(\d+)',expand=True) # everything ends up in a single column
0 | |
---|---|
0 | 24 |
1 | 44 |
2 | 54 |
3 | 64 |
4 | NaN |
5 | 21 |
6 | 15 |
7 | 20 |
# using '|' ("or")
# ^ beginning of string
# $ end of the string
someData.age.str.extract(pat=r'(^\d+|\d+$|\s\d+\s)',
expand=True)
0 | |
---|---|
0 | 24 |
1 | 44 |
2 | 54 |
3 | 64 |
4 | NaN |
5 | 21 |
6 | 15 |
7 | 20 |
Let me use both results:
someData['age1']=someData.age.str.replace(pat=r'\d+[a-z]+',
repl='',
regex=True).\
str.extract(pat=r'(\d+)',expand=True)
someData['age2']=someData.age.str.extract(pat=r'(^\d+|\d+$|\s\d+\s)',
expand=True)
# we have
someData
age | age1 | age2 | |
---|---|---|---|
0 | It is:24 | 24 | 24 |
1 | It is: 44 | 44 | 44 |
2 | It is54 | 54 | 54 |
3 | 64 it is | 64 | 64 |
4 | I am twenty | NaN | NaN |
5 | The 10th I turn 21 | 21 | 21 |
6 | I am 15 years old | 15 | 15 |
7 | ~20 | 20 | 20 |
#checking data type
someData.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8 entries, 0 to 7 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 8 non-null object 1 age1 7 non-null object 2 age2 7 non-null object dtypes: object(3) memory usage: 324.0+ bytes
# they are the same... right?
someData['age1'].to_list()==someData['age2'].to_list() # compares the whole lists at once
False
# where is the difference?
someData['age1']==someData['age2'] # element-by-element comparison
0 True 1 True 2 True 3 True 4 False 5 True 6 False 7 True dtype: bool
# the same in both:
set(someData['age1']) & set(someData['age2'])
{'20', '21', '24', '44', '54', '64', nan}
# different between them
set(someData['age1']) ^ set(someData['age2']) # one of them carries extra spaces
{' 15 ', '15'}
The last result tells you why they are not the same. Notice that turning a pandas Series into a list reveals the details:
someData['age1'].to_list()
['24', '44', '54', '64', nan, '21', '15', '20']
someData['age2'].to_list()
['24', '44', '54', '64', nan, '21', ' 15 ', '20']
someData['age2'].str.strip().to_list()
['24', '44', '54', '64', nan, '21', '15', '20']
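A compact check of the fix (the values are copied from the lists above):

```python
import pandas as pd
import numpy as np

age1 = pd.Series(['24', '44', '54', '64', np.nan, '21', '15', '20'])
age2 = pd.Series(['24', '44', '54', '64', np.nan, '21', ' 15 ', '20'])

# Series.equals treats NaNs in the same position as equal, so after
# stripping the whitespace both extraction strategies agree
print(age1.equals(age2.str.strip()))  # True
```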
The CIA has several indicators for world countries:
Review the topics related to cleaning discussed in class, and see what may need cleaning in this indicator from the CIA:
from IPython.display import IFrame
ciaLink1="https://www.cia.gov/the-world-factbook/field/carbon-dioxide-emissions/country-comparison"
IFrame(ciaLink1, width=900, height=900) # just displays the web page inline
You need to make sure you have installed: pandas, html5lib, lxml, and beautifulsoup4.
You can use pip show to verify (for instance, pip show pandas). If you have all of them, run this code to get the data:
!pip show pandas html5lib lxml beautifulsoup4
Name: pandas Version: 2.0.3 Summary: Powerful data structures for data analysis, time series, and statistics (license text omitted) --- Name: html5lib Version: 1.1 Summary: HTML parser based on the WHATWG HTML specification --- Name: lxml Version: 4.9.2 Summary: Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API. --- Name: beautifulsoup4 Version: 4.12.2 Summary: Screen-scraping library
# read web table into pandas DF
import pandas as pd
linkToFile='https://github.com/CienciaDeDatosEspacial/code_and_data/raw/main/data/carbonEmi_downloaded.csv'
carbon=pd.read_csv(linkToFile)
# here it is:
carbon
name | slug | value | date_of_information | ranking | region | |
---|---|---|---|---|---|---|
0 | China | china | 10,773,248,000.0 | 2019 est. | 1 | East and Southeast Asia |
1 | United States | united-states | 5,144,361,000.0 | 2019 est. | 2 | North America |
2 | India | india | 2,314,738,000.0 | 2019 est. | 3 | South Asia |
3 | Russia | russia | 1,848,070,000.0 | 2019 est. | 4 | Central Asia |
4 | Japan | japan | 1,103,234,000.0 | 2019 est. | 5 | East and Southeast Asia |
... | ... | ... | ... | ... | ... | ... |
213 | Antarctica | antarctica | 28,000.0 | 2019 est. | 214 | Antarctica |
214 | Saint Helena, Ascension, and Tristan da Cunha | saint-helena-ascension-and-tristan-da-cunha | 13,000.0 | 2019 est. | 215 | Africa |
215 | Niue | niue | 8,000.0 | 2019 est. | 216 | Australia and Oceania |
216 | Northern Mariana Islands | northern-mariana-islands | 0.0 | 2019 est. | 217 | Australia and Oceania |
217 | Tuvalu | tuvalu | 0.0 | 2019 est. | 218 | Australia and Oceania |
218 rows × 6 columns
# also
carbon.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 218 entries, 0 to 217 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 218 non-null object 1 slug 218 non-null object 2 value 218 non-null object 3 date_of_information 218 non-null object 4 ranking 218 non-null int64 5 region 218 non-null object dtypes: int64(1), object(5) memory usage: 10.3+ KB
# frequency table
carbon.region.value_counts()
region Africa 55 Europe 43 Central America and the Caribbean 27 East and Southeast Asia 20 Australia and Oceania 20 Middle East 19 South America 13 South Asia 8 North America 6 Central Asia 6 Antarctica 1 Name: count, dtype: int64
# frequency table
carbon.date_of_information.value_counts()
date_of_information 2019 est. 216 2012 est. 1 2017 est. 1 Name: count, dtype: int64
Complete the tasks requested:
When all tasks are done, create a folder data inside the current folder, and save the cleaned file like this:
# import os
# carbonCleaned.to_csv(os.path.join("data","carbonCleaned.csv"),index=False)
#(1)
# make a copy:
carbon_new=carbon.copy()
# choose the columns to drop
byeColumn=['slug']
carbon_new.drop(columns=byeColumn,inplace=True)
carbon_new
name | value | date_of_information | ranking | region | |
---|---|---|---|---|---|
0 | China | 10,773,248,000.0 | 2019 est. | 1 | East and Southeast Asia |
1 | United States | 5,144,361,000.0 | 2019 est. | 2 | North America |
2 | India | 2,314,738,000.0 | 2019 est. | 3 | South Asia |
3 | Russia | 1,848,070,000.0 | 2019 est. | 4 | Central Asia |
4 | Japan | 1,103,234,000.0 | 2019 est. | 5 | East and Southeast Asia |
... | ... | ... | ... | ... | ... |
213 | Antarctica | 28,000.0 | 2019 est. | 214 | Antarctica |
214 | Saint Helena, Ascension, and Tristan da Cunha | 13,000.0 | 2019 est. | 215 | Africa |
215 | Niue | 8,000.0 | 2019 est. | 216 | Australia and Oceania |
216 | Northern Mariana Islands | 0.0 | 2019 est. | 217 | Australia and Oceania |
217 | Tuvalu | 0.0 | 2019 est. | 218 | Australia and Oceania |
218 rows × 5 columns
#(2)
carbon_new.rename(columns={"date_of_information":"carbon_date"}, inplace=True)
carbon_new
name | value | carbon_date | ranking | region | |
---|---|---|---|---|---|
0 | China | 10,773,248,000.0 | 2019 est. | 1 | East and Southeast Asia |
1 | United States | 5,144,361,000.0 | 2019 est. | 2 | North America |
2 | India | 2,314,738,000.0 | 2019 est. | 3 | South Asia |
3 | Russia | 1,848,070,000.0 | 2019 est. | 4 | Central Asia |
4 | Japan | 1,103,234,000.0 | 2019 est. | 5 | East and Southeast Asia |
... | ... | ... | ... | ... | ... |
213 | Antarctica | 28,000.0 | 2019 est. | 214 | Antarctica |
214 | Saint Helena, Ascension, and Tristan da Cunha | 13,000.0 | 2019 est. | 215 | Africa |
215 | Niue | 8,000.0 | 2019 est. | 216 | Australia and Oceania |
216 | Northern Mariana Islands | 0.0 | 2019 est. | 217 | Australia and Oceania |
217 | Tuvalu | 0.0 | 2019 est. | 218 | Australia and Oceania |
218 rows × 5 columns
#(3)
carbon_new['region']=carbon_new.region.str.strip() # assign back so the change sticks
carbon_new.region
0 East and Southeast Asia 1 North America 2 South Asia 3 Central Asia 4 East and Southeast Asia ... 213 Antarctica 214 Africa 215 Australia and Oceania 216 Australia and Oceania 217 Australia and Oceania Name: region, Length: 218, dtype: object
#(4)
# first, check whether 'value' holds symbols that are not digits
# \D matches non-digit characters
carbon_new.value.str.contains(pat=r'\D',regex=True) # True: the cells contain commas, periods, or other non-digit symbols
0 True 1 True 2 True 3 True 4 True ... 213 True 214 True 215 True 216 True 217 True Name: value, Length: 218, dtype: bool
#(4)
# now check the 'carbon_date' column
carbon_new.carbon_date.str.contains(pat=r'\D', regex=True)
0 True 1 True 2 True 3 True 4 True ... 213 True 214 True 215 True 216 True 217 True Name: carbon_date, Length: 218, dtype: bool
#(5)
# check whether there are blank spaces in the 'name' column
carbon_new.name.str.contains(pat=r'\s', regex=True)
0 False 1 True 2 False 3 False 4 False ... 213 False 214 True 215 False 216 True 217 False Name: name, Length: 218, dtype: bool
#(6)
# remove the spaces
carbon2=carbon_new.name.str.replace(pat=r'\s',repl='',regex=True)
carbon2
0 China 1 UnitedStates 2 India 3 Russia 4 Japan ... 213 Antarctica 214 SaintHelena,Ascension,andTristandaCunha 215 Niue 216 NorthernMarianaIslands 217 Tuvalu Name: name, Length: 218, dtype: object
#carbon3=carbon_new.carbon_date.str.extract(pat=r'(^\w+|.$)',expand=True)
carbon_new['carbon_date']=carbon_new.carbon_date.str.extract(pat=r'(^\w+|.$)',expand=True)
carbon_new['name']=carbon_new.name.str.replace(pat=r'\s',repl='',regex=True)
carbon_new
name | value | carbon_date | ranking | region | |
---|---|---|---|---|---|
0 | China | 10,773,248,000.0 | 2019 | 1 | East and Southeast Asia |
1 | UnitedStates | 5,144,361,000.0 | 2019 | 2 | North America |
2 | India | 2,314,738,000.0 | 2019 | 3 | South Asia |
3 | Russia | 1,848,070,000.0 | 2019 | 4 | Central Asia |
4 | Japan | 1,103,234,000.0 | 2019 | 5 | East and Southeast Asia |
... | ... | ... | ... | ... | ... |
213 | Antarctica | 28,000.0 | 2019 | 214 | Antarctica |
214 | SaintHelena,Ascension,andTristandaCunha | 13,000.0 | 2019 | 215 | Africa |
215 | Niue | 8,000.0 | 2019 | 216 | Australia and Oceania |
216 | NorthernMarianaIslands | 0.0 | 2019 | 217 | Australia and Oceania |
217 | Tuvalu | 0.0 | 2019 | 218 | Australia and Oceania |
218 rows × 5 columns
carbon_new['carbon_date'].value_counts()
carbon_date 2019 216 2012 1 2017 1 Name: count, dtype: int64
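The columns are still strings (`object` dtype). If numeric types are needed later, a possible follow-up (my assumption, not one of the requested tasks) is to drop the thousands separators and cast:

```python
import pandas as pd

# tiny sample mimicking the cleaned columns above (illustrative values)
sample = pd.DataFrame({'value': ['10,773,248,000.0', '28,000.0'],
                       'carbon_date': ['2019', '2012']})
# remove commas, then cast each column to a numeric dtype
sample['value'] = sample['value'].str.replace(',', '', regex=False).astype(float)
sample['carbon_date'] = sample['carbon_date'].astype(int)
print(sample['value'].tolist())        # [10773248000.0, 28000.0]
print(sample['carbon_date'].tolist())  # [2019, 2012]
```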
carbonCleaned=carbon_new
carbonCleaned
name | value | carbon_date | ranking | region | |
---|---|---|---|---|---|
0 | China | 10,773,248,000.0 | 2019 | 1 | East and Southeast Asia |
1 | UnitedStates | 5,144,361,000.0 | 2019 | 2 | North America |
2 | India | 2,314,738,000.0 | 2019 | 3 | South Asia |
3 | Russia | 1,848,070,000.0 | 2019 | 4 | Central Asia |
4 | Japan | 1,103,234,000.0 | 2019 | 5 | East and Southeast Asia |
... | ... | ... | ... | ... | ... |
213 | Antarctica | 28,000.0 | 2019 | 214 | Antarctica |
214 | SaintHelena,Ascension,andTristandaCunha | 13,000.0 | 2019 | 215 | Africa |
215 | Niue | 8,000.0 | 2019 | 216 | Australia and Oceania |
216 | NorthernMarianaIslands | 0.0 | 2019 | 217 | Australia and Oceania |
217 | Tuvalu | 0.0 | 2019 | 218 | Australia and Oceania |
218 rows × 5 columns
import os
carbonCleaned.to_csv(os.path.join("data","carbonCleaned.csv"),index=False)
from IPython.display import IFrame
ciaLink2="https://www.cia.gov/the-world-factbook/field/revenue-from-forest-resources/country-comparison"
IFrame(ciaLink2, width=900, height=900)
Let's scrape that data:
# read web table into pandas DF
import pandas as pd
forestDFs=pd.read_html(ciaLink2, # link
header=0, # where is the header?
flavor='bs4')
# check object type
type(forestDFs)
list
# check size
len(forestDFs)
1
Let's create a copy of that DF:
# make a copy
forest=forestDFs[0].copy()
# here it is
forest
Rank | Country | % of GDP | Date of Information | |
---|---|---|---|---|
0 | 1 | Solomon Islands | 20.27 | 2018 est. |
1 | 2 | Liberia | 13.27 | 2018 est. |
2 | 3 | Burundi | 10.31 | 2018 est. |
3 | 4 | Guinea-Bissau | 9.24 | 2018 est. |
4 | 5 | Central African Republic | 8.99 | 2018 est. |
... | ... | ... | ... | ... |
199 | 200 | Guam | 0.00 | 2018 est. |
200 | 201 | Faroe Islands | 0.00 | 2017 est. |
201 | 202 | Aruba | 0.00 | 2017 est. |
202 | 203 | Virgin Islands | 0.00 | 2017 est. |
203 | 204 | Macau | 0.00 | 2018 est. |
204 rows × 4 columns
# see column names:
forest.columns
Index(['Rank', 'Country', '% of GDP', 'Date of Information'], dtype='object')
# or
forest.columns.to_list()
['Rank', 'Country', '% of GDP', 'Date of Information']
forest['Date of Information'].value_counts()
Date of Information 2018 est. 195 2017 est. 7 2015 est. 1 2016 est. 1 Name: count, dtype: int64
Complete the tasks requested:
When all tasks are done, save the cleaned file inside your data folder:
forest_new=forest.copy()
forest_new
Rank | Country | % of GDP | Date of Information | |
---|---|---|---|---|
0 | 1 | Solomon Islands | 20.27 | 2018 est. |
1 | 2 | Liberia | 13.27 | 2018 est. |
2 | 3 | Burundi | 10.31 | 2018 est. |
3 | 4 | Guinea-Bissau | 9.24 | 2018 est. |
4 | 5 | Central African Republic | 8.99 | 2018 est. |
... | ... | ... | ... | ... |
199 | 200 | Guam | 0.00 | 2018 est. |
200 | 201 | Faroe Islands | 0.00 | 2017 est. |
201 | 202 | Aruba | 0.00 | 2017 est. |
202 | 203 | Virgin Islands | 0.00 | 2017 est. |
203 | 204 | Macau | 0.00 | 2018 est. |
204 rows × 4 columns
#1
forest_new.rename(columns=lambda s:s.replace("%","pct"), inplace=True)
#forest_new.rename(columns={'% of GDP':'pct of GDP'})
#forest_new.columns=forest_new.columns.str.replace("%","pct")
forest_new
Rank | Country | pct of GDP | Date of Information | |
---|---|---|---|---|
0 | 1 | Solomon Islands | 20.27 | 2018 est. |
1 | 2 | Liberia | 13.27 | 2018 est. |
2 | 3 | Burundi | 10.31 | 2018 est. |
3 | 4 | Guinea-Bissau | 9.24 | 2018 est. |
4 | 5 | Central African Republic | 8.99 | 2018 est. |
... | ... | ... | ... | ... |
199 | 200 | Guam | 0.00 | 2018 est. |
200 | 201 | Faroe Islands | 0.00 | 2017 est. |
201 | 202 | Aruba | 0.00 | 2017 est. |
202 | 203 | Virgin Islands | 0.00 | 2017 est. |
203 | 204 | Macau | 0.00 | 2018 est. |
204 rows × 4 columns
#2
byeColumn=['Rank']
forest_new.drop(columns=byeColumn,inplace=True)
forest_new
Country | pct of GDP | Date of Information | |
---|---|---|---|
0 | Solomon Islands | 20.27 | 2018 est. |
1 | Liberia | 13.27 | 2018 est. |
2 | Burundi | 10.31 | 2018 est. |
3 | Guinea-Bissau | 9.24 | 2018 est. |
4 | Central African Republic | 8.99 | 2018 est. |
... | ... | ... | ... |
199 | Guam | 0.00 | 2018 est. |
200 | Faroe Islands | 0.00 | 2017 est. |
201 | Aruba | 0.00 | 2017 est. |
202 | Virgin Islands | 0.00 | 2017 est. |
203 | Macau | 0.00 | 2018 est. |
204 rows × 3 columns
#3
forest_new.rename(columns={'Date of Information':'forest_date'},inplace=True)
forest_new
Country | pct of GDP | forest_date | |
---|---|---|---|
0 | Solomon Islands | 20.27 | 2018 est. |
1 | Liberia | 13.27 | 2018 est. |
2 | Burundi | 10.31 | 2018 est. |
3 | Guinea-Bissau | 9.24 | 2018 est. |
4 | Central African Republic | 8.99 | 2018 est. |
... | ... | ... | ... |
199 | Guam | 0.00 | 2018 est. |
200 | Faroe Islands | 0.00 | 2017 est. |
201 | Aruba | 0.00 | 2017 est. |
202 | Virgin Islands | 0.00 | 2017 est. |
203 | Macau | 0.00 | 2018 est. |
204 rows × 3 columns
#4
#forest_new.columns.str.contains(pat=r'\s',regex=True)
forest_new["Country"]=forest_new.Country.str.replace(pat=r'\s',repl='',regex=True) # (unclear whether the task targets spaces
# in the column names or in the data; this removes them from the data)
forest_new
Country | pct of GDP | forest_date | |
---|---|---|---|
0 | SolomonIslands | 20.27 | 2018 est. |
1 | Liberia | 13.27 | 2018 est. |
2 | Burundi | 10.31 | 2018 est. |
3 | Guinea-Bissau | 9.24 | 2018 est. |
4 | CentralAfricanRepublic | 8.99 | 2018 est. |
... | ... | ... | ... |
199 | Guam | 0.00 | 2018 est. |
200 | FaroeIslands | 0.00 | 2017 est. |
201 | Aruba | 0.00 | 2017 est. |
202 | VirginIslands | 0.00 | 2017 est. |
203 | Macau | 0.00 | 2018 est. |
204 rows × 3 columns
#5
forest_new["Country"]=forest_new.Country.str.strip()
forest_new
Country | pct of GDP | forest_date | |
---|---|---|---|
0 | SolomonIslands | 20.27 | 2018 est. |
1 | Liberia | 13.27 | 2018 est. |
2 | Burundi | 10.31 | 2018 est. |
3 | Guinea-Bissau | 9.24 | 2018 est. |
4 | CentralAfricanRepublic | 8.99 | 2018 est. |
... | ... | ... | ... |
199 | Guam | 0.00 | 2018 est. |
200 | FaroeIslands | 0.00 | 2017 est. |
201 | Aruba | 0.00 | 2017 est. |
202 | VirginIslands | 0.00 | 2017 est. |
203 | Macau | 0.00 | 2018 est. |
204 rows × 3 columns
#6
forest_new['forest_date']=forest_new.forest_date.str.extract(pat=r'(^\w+|.$)',expand=True)
forest_new
Country | pct of GDP | forest_date | |
---|---|---|---|
0 | SolomonIslands | 20.27 | 2018 |
1 | Liberia | 13.27 | 2018 |
2 | Burundi | 10.31 | 2018 |
3 | Guinea-Bissau | 9.24 | 2018 |
4 | CentralAfricanRepublic | 8.99 | 2018 |
... | ... | ... | ... |
199 | Guam | 0.00 | 2018 |
200 | FaroeIslands | 0.00 | 2017 |
201 | Aruba | 0.00 | 2017 |
202 | VirginIslands | 0.00 | 2017 |
203 | Macau | 0.00 | 2018 |
204 rows × 3 columns
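An alternative to the `r'(^\w+|.$)'` pattern used above (just another option, not the one applied here) is to grab the first run of exactly four digits:

```python
import pandas as pd

dates = pd.Series(['2018 est.', '2017 est.', '2015 est.'])
# \d{4} matches exactly four consecutive digits, i.e. the year
years = dates.str.extract(r'(\d{4})', expand=False)
print(years.tolist())  # ['2018', '2017', '2015']
```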
forestCleaned=forest_new
forestCleaned
Country | pct of GDP | forest_date | |
---|---|---|---|
0 | SolomonIslands | 20.27 | 2018 |
1 | Liberia | 13.27 | 2018 |
2 | Burundi | 10.31 | 2018 |
3 | Guinea-Bissau | 9.24 | 2018 |
4 | CentralAfricanRepublic | 8.99 | 2018 |
... | ... | ... | ... |
199 | Guam | 0.00 | 2018 |
200 | FaroeIslands | 0.00 | 2017 |
201 | Aruba | 0.00 | 2017 |
202 | VirginIslands | 0.00 | 2017 |
203 | Macau | 0.00 | 2018 |
204 rows × 3 columns
import os
forestCleaned.to_csv(os.path.join("data","forestCleaned.csv"),index=False)