Skip to main content

Modeling timeseries data

This is a basic example of how to convert data into events and declare the features to compute from them.

Importing Libraries

Here we import pandas for data manipulation and fexpress, our custom Rust-based feature engineering library.

import os
import subprocess
import sys

# Install fexpress into the environment of the interpreter that is running
# this notebook. Invoking pip as `python -m pip` through sys.executable avoids
# picking up a `pip` executable that belongs to a different Python
# installation, and passing an argument list avoids going through a shell.
# check=False keeps the install best-effort: if it fails, the import of
# fexpress below reports the problem.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "fexpress"],
    check=False,
)

import pandas as pd
import fexpress as fx
Requirement already satisfied: fexpress in /Users/pawel/.pyenv/versions/3.9.16/envs/fexpress/lib/python3.9/site-packages (0.0.3)


WARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.
You should consider upgrading via the '/Users/pawel/.pyenv/versions/3.9.16/envs/fexpress/bin/python3.9 -m pip install --upgrade pip' command.

Loading Data

We load the first 10,000 rows of the "weatherAUS.csv" dataset and preview the first few rows to see its columns.

# Use the Kaggle-mounted copy of the dataset when it exists; otherwise fall
# back to a copy stored relative to the working directory.
KAGGLE_CSV = "/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv"
LOCAL_CSV = "datasets/weatherAUS.csv"
csv_path = KAGGLE_CSV if os.path.exists(KAGGLE_CSV) else LOCAL_CSV

# Only the first 10,000 rows are read.
df = pd.read_csv(csv_path, nrows=10000)
df.head()
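| | Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | ... | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | No |
| 1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | No |
| 2 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | No |
| 3 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | No |
| 4 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | No |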

5 rows × 23 columns

Creating Events

We're iterating through the dataframe and creating a new event for each row, encapsulating various weather attributes. These events are added to the FeatureExpress context.

# Columns copied from each dataframe row into the event's `attrs` payload.
ATTR_COLUMNS = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
    'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
    'Temp3pm', 'RainToday',
]

event_context = fx.FeatureExpress()
for row in df.itertuples():
    event = fx.Event(
        # The dataframe index serves as the event id.
        event_id=str(row.Index),
        entities={"city": row.Location},
        event_type="reading",
        event_time=str(row.Date),
        # Missing values (NaN) are left out, so a missing measurement is
        # simply absent from the event's attributes.
        attrs={
            col: getattr(row, col)
            for col in ATTR_COLUMNS
            if pd.notna(getattr(row, col))
        },
    )
    event_context.new_event(event)

Configuration

Setting up the observation dates and query configurations.

# Observation dates: an interval of nth=7 with a DAY date part, generated for
# the "city" entity type (presumably one observation date every 7 days per
# city -- confirm against the fexpress documentation).
obs_cfg = fx.sdk.observation_dates_config
weekly_interval = obs_cfg.Interval(
    date_part=obs_cfg.DatePart.DAY,
    entity_types=["city"],
    nth=7,
)
obs_dates_config = fx.ObservationDateConfig(interval=weekly_interval)

# Event scope keyed on the "city" entity.
event_scope_config = fx.sdk.event_scope_config.EventScopeConfigClass(
    related_entities_events=["city"],
)

# Query options: exclude events on the observation date, run in parallel.
query_config = fx.sdk.query_config.QueryConfig(
    include_events_on_obs_date=False,
    parallel=True,
)

Schema Information

Print the schema information for the reading event.

# Fetch the schema held by the underlying event context and print it.
schema = event_context.event_context.schema()
print(schema)
{
"reading": {
"Temp3pm": "Num",
"Cloud3pm": "Num",
"Evaporation": "Num",
"Temp9am": "Num",
"Sunshine": "Num",
"WindDir9am": "Str",
"MaxTemp": "Num",
"Pressure3pm": "Num",
"Cloud9am": "Num",
"Rainfall": "Num",
"WindDir3pm": "Str",
"Pressure9am": "Num",
"RainToday": "Str",
"WindSpeed9am": "Num",
"WindGustSpeed": "Num",
"WindGustDir": "Str",
"Humidity3pm": "Num",
"WindSpeed3pm": "Num",
"Humidity9am": "Num",
"MinTemp": "Num"
}
}

Querying Features

Using our defined configuration, we query features like average temperature, wind speed, rainfall, etc. over different time windows.

%%time
# Feature declarations, one expression per output column. Expressions without
# an explicit `as <name>` alias appear in the result under their full text.
feature_queries = [
    "obs_dt as obs_dt",
    "@entities.city as city",
    "avg(MaxTemp) over last 7 days",
    "min(MinTemp) over last 7 days",
    "max(WindGustSpeed) over last 3 days",
    "last(Humidity3pm) over past",
    "first(Humidity9am) over future",
    "sum(Rainfall) over last 30 days",
    "avg(WindSpeed9am) over last 5 days",
    "avg(WindSpeed3pm) over last 5 days",
    "last(Temp3pm) over last 3 days",
    "first(Temp9am) over last 3 days",
    "count(*) over last 30 days where RainToday = 'Yes' as rainy_days",
    "count(*) over last 30 days where RainToday = 'No' as non_rainy_days",
    "avg(Cloud9am) over last 7 days",
    "avg(Cloud3pm) over last 7 days",
    "sum(Pressure9am) over last 3 days",
    "sum(Pressure3pm) over last 3 days",
    "last(WindGustDir) over past",
    "max(Temp3pm) over last 7 days",
    "min(Temp9am) over last 7 days",
]

# Reuse the event scope, observation-date and query configuration objects
# defined in the Configuration step instead of rebuilding an identical
# event scope config here.
features = event_context.query(
    obs_dates_config=obs_dates_config,
    event_scope_config=event_scope_config,
    query_config=query_config,
    query=feature_queries,
)
print(features.head())
                    obs_dt   city  avg(MaxTemp) over last 7 days   
0 2008-12-31 23:59:59.999 Cobar NaN \
1 2009-01-07 23:59:59.999 Cobar 35.899998
2 2009-01-14 23:59:59.999 Cobar 36.757145
3 2009-01-21 23:59:59.999 Cobar 35.871429
4 2009-01-28 23:59:59.999 Cobar 36.542858

min(MinTemp) over last 7 days max(WindGustSpeed) over last 3 days
0 NaN NaN \
1 15.500000 43.0
2 16.100000 43.0
3 17.900000 59.0
4 19.700001 46.0

last(Humidity3pm) over past first(Humidity9am) over future
0 NaN 20.0 \
1 19.0 33.0
2 15.0 24.0
3 52.0 71.0
4 17.0 31.0

sum(Rainfall) over last 30 days avg(WindSpeed9am) over last 5 days
0 NaN NaN \
1 0.0 15.0
2 0.0 19.4
3 4.8 18.6
4 31.4 16.0

avg(WindSpeed3pm) over last 5 days ... first(Temp9am) over last 3 days
0 NaN ... NaN \
1 12.200000 ... 29.100000
2 10.600000 ... 29.799999
3 17.200001 ... 26.200001
4 10.800000 ... 28.700001

rainy_days non_rainy_days avg(Cloud9am) over last 7 days
0 0 0 NaN \
1 0 7 2.333333
2 0 14 1.000000
3 1 20 2.428571
4 3 25 2.571429

avg(Cloud3pm) over last 7 days sum(Pressure9am) over last 3 days
0 NaN NaN \
1 4.571429 3031.100098
2 2.571429 3039.100098
3 4.142857 3036.199951
4 3.428571 3044.200195

sum(Pressure3pm) over last 3 days last(WindGustDir) over past
0 NaN None \
1 3023.899902 N
2 3030.600098 SSW
3 3027.899902 N
4 3036.199951 E

max(Temp3pm) over last 7 days min(Temp9am) over last 7 days
0 NaN NaN
1 37.599998 20.299999
2 38.099998 20.700001
3 37.799999 19.900000
4 38.700001 24.400000

[5 rows x 21 columns]
CPU times: user 411 ms, sys: 35.7 ms, total: 447 ms
Wall time: 157 ms