Evolve Interview Project¶
Zoë Farmer¶
Press the space bar to proceed to the next slide. See here for a brief tutorial.
Who am I?¶
- My name is Zoë Farmer
- Recent CU graduate with a BS in Applied Math and a CS Minor
- Co-coordinator of the Boulder Python Meetup
- Big fan of open source software
- http://www.dataleek.io
- @thedataleek
- git(hub|lab).com/thedataleek
General Tooling Overview¶
- Everything is in Python 3.6
- I use jupyter, pandas, numpy, matplotlib, scikit-learn, nltk, and scipy.
- Some code has been skipped for brevity. See this link for full code.
- Development was performed in a Jupyter Notebook, which is available at the above link.
- Presentation powered by Reveal.js
The Data¶
What is it?¶
A year of data about Boston scraped from AirBnB, consisting of two datasets:
- listing details
- calendar information
(1) Listings - details about locations¶
Our first dataset is a large number of listings and associated descriptions.
listing_data = pd.read_csv('./ListingsAirbnbScrapeExam.csv')
len(listing_data)
3585
', '.join(listing_data.columns)
'id, name, summary, space, description, experiences_offered, neighborhood_overview, notes, transit, access, interaction, house_rules, host_name, host_since, host_location, host_about, host_response_time, host_response_rate, host_acceptance_rate, host_is_superhost, host_neighbourhood, host_listings_count, host_total_listings_count, host_verifications, host_has_profile_pic, host_identity_verified, street, neighbourhood_cleansed, city, state, zipcode, market, smart_location, country_code, country, latitude, longitude, is_location_exact, property_type, room_type, accommodates, bathrooms, bedrooms, beds, bed_type, amenities, square_feet, price, weekly_price, monthly_price, security_deposit, cleaning_fee, guests_included, extra_people, minimum_nights, maximum_nights, calendar_updated, has_availability, availability_30, availability_60, availability_90, availability_365, calendar_last_scraped, number_of_reviews, first_review, last_review, review_scores_rating, review_scores_accuracy, review_scores_cleanliness, review_scores_checkin, review_scores_communication, review_scores_location, review_scores_value, requires_license, license, jurisdiction_names, instant_bookable, cancellation_policy, require_guest_profile_picture, require_guest_phone_verification, calculated_host_listings_count, reviews_per_month'
(2) Calendar Data - location occupancy by date¶
Our second dataset is a set of listings by date, occupancy, and price.
- We want to parse these fields
- datestrings to be formatted as Python datetime objects
- the price field to be parsed as floats (a sketch of the converter follows)
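The price_converter used below is part of the code skipped for brevity; a minimal sketch, assuming the scraped prices arrive as strings like "$1,250.00" (empty when missing), might look like:

def price_converter(price):
    # Assumed helper (not shown in the slides): strip '$' and thousands
    # separators, returning a float, or NaN for empty fields.
    price = price.strip().lstrip('$').replace(',', '')
    return float(price) if price else float('nan')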
calendar_data = pd.read_csv(
'./CalendarAirbnbScrapeExam.csv',
converters={
'available': lambda x: True if x == 'f' else False,
'price': price_converter
}
)
calendar_data['filled'] = ~calendar_data['available']
calendar_data['date'] = pd.to_datetime(calendar_data['date'],
infer_datetime_format=True)
calendar_data.head(1)
| | listing_id | date | available | price | filled |
|---|---|---|---|---|---|
| 0 | 12147973 | 2017-09-05 | True | NaN | False |
Dataset Merge¶
We want to combine the two datasets:
- Calculate the number of nights occupied per listing and add it to the listing data.
- Calculate the average and standard deviation of price per night.
But first, let's make sure the datasets overlap.
listing_keys = set(listing_data.id)
calendar_keys = set(calendar_data.listing_id)
difference = listing_keys.difference(calendar_keys)
print(f'# Listing Keys: {len(listing_keys)}')
print(f'# Calendar Keys: {len(calendar_keys)}')
print(f'# Difference: {len(difference)}')
# Listing Keys: 3585
# Calendar Keys: 2872
# Difference: 713
They don't; in fact, we're missing information on about 700 listings.
For our num_filled column, let's establish the assumption that a NaN value stands for "unknown".
Groupby¶
We can simply sum() our available and filled boolean fields. This gives us the total number of nights occupied (or available) per listing.
Note that in the final aggregation these two fields sum to 365.
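The groupby that produces fill_dates is skipped in the slides; a sketch consistent with the aggregated output below (multi-level columns from a dict-of-lists aggregation) might be:

# Hedged sketch: total available/filled nights and price statistics per listing
fill_dates = calendar_data.groupby('listing_id').agg({
    'available': 'sum',
    'filled': 'sum',
    'price': ['mean', 'std']
})
fill_dates['listing_id'] = fill_dates.index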
fill_dates.head()
| listing_id (index) | (available, sum) | (filled, sum) | (price, mean) | (price, std) | listing_id |
|---|---|---|---|---|---|
| 5506 | 21.0 | 344.0 | 147.267442 | 17.043196 | 5506 |
| 6695 | 41.0 | 324.0 | 197.407407 | 17.553300 | 6695 |
| 6976 | 46.0 | 319.0 | 65.000000 | 0.000000 | 6976 |
| 8792 | 117.0 | 248.0 | 154.000000 | 0.000000 | 8792 |
| 9273 | 1.0 | 364.0 | 225.000000 | 0.000000 | 9273 |
Left Join¶
Now we merge with our original dataset using a left join.
combined_data = listing_data.merge(
fill_dates,
how='left',
left_on='id',
right_on='listing_id'
)
combined_data.rename(
columns={
('available', 'sum'): 'available',
('filled', 'sum'): 'filled',
('price', 'mean'): 'avg_price',
('price', 'std'): 'std_price'
},
inplace=True
)
/home/zoe/.local/lib/python3.6/site-packages/pandas/core/reshape/merge.py:551: UserWarning: merging between different levels can give an unintended result (1 levels on the left, 2 on the right) warnings.warn(msg, UserWarning)
combined_data[['id', 'name', 'available', 'avg_price', 'std_price']].head(10)
| | id | name | available | avg_price | std_price |
|---|---|---|---|---|---|
| 0 | 12147973 | Sunny Bungalow in the City | 365.0 | NaN | NaN |
| 1 | 3075044 | Charming room in pet friendly apt | 6.0 | 67.813370 | 4.502791 |
| 2 | 6976 | Mexican Folk Art Haven in Boston | 46.0 | 65.000000 | 0.000000 |
| 3 | 1436513 | Spacious Sunny Bedroom Suite in Historic Home | 267.0 | 75.000000 | 0.000000 |
| 4 | 7651065 | Come Home to Boston | 31.0 | 79.000000 | 0.000000 |
| 5 | 12386020 | Private Bedroom + Great Coffee | 307.0 | 75.000000 | 0.000000 |
| 6 | 5706985 | New Lrg Studio apt 15 min to Boston | 21.0 | 111.755814 | 18.403439 |
| 7 | 2843445 | "Tranquility" on "Top of the Hill" | 0.0 | 75.000000 | 0.000000 |
| 8 | 753446 | 6 miles away from downtown Boston! | 18.0 | 59.363112 | 3.629618 |
| 9 | 849408 | Perfect & Practical Boston Rental | 258.0 | 252.925234 | 31.012992 |
Neighborhood Statistics¶
Now that we've added those columns to the listing data, we can start to get neighborhood-specific statistics.
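The aggregation itself isn't shown; one hedged sketch of per-neighborhood statistics from combined_data would be:

# Assumed aggregation: mean nightly price and mean fill count per neighborhood
neighborhood_stats = combined_data.groupby('neighbourhood_cleansed').agg({
    'avg_price': 'mean',
    'filled': 'mean'
})
neighborhood_stats.sort_values('avg_price', ascending=False).head()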
Seasonal Trends¶
We have a year of data, so let's examine how the seasons affect occupancy.
We can take a naive approach: simply groupby each date and plot the number of listings filled on that date.
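A sketch of that naive approach (the plotting code is skipped in the slides; the matplotlib usage here is an assumption):

import matplotlib.pyplot as plt

# Count how many listings are filled on each date and plot the series
daily_filled = calendar_data.groupby('date')['filled'].sum()
daily_filled.plot(figsize=(12, 4), title='Listings filled per day')
plt.show()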
Let's do better¶
This chart has some irregularities and is a little unclear about the type of trends we're looking for.
Let's look at only the listings that are filled each day of the year, and look at their prices as the year goes by.
We'll refer to these as "indicator listings".
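The computation of top_listings is skipped in the slides; a sketch, assuming "filled each day of the year" means all 365 nights, might be:

# Hedged sketch: listings occupied every night of the year
nights_filled = calendar_data.groupby('listing_id')['filled'].sum()
top_listings = nights_filled[nights_filled == 365].index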
print(f'Original Datasize: {len(calendar_data.listing_id.unique())}.')
print(f'Pruned Datasize: {len(top_listings)}')
Original Datasize: 2872.
Pruned Datasize: 81
This shrinks our dataset by a lot, but that's ok.
We're looking for indicator listings, not the entire dataset.
Plotting our Busy Listings¶
Reducing Noise¶
This chart has too much noise and the trends are even less clear.
- Remove all listings with low (or extremely high) standard deviation in price (the filtering is sketched below)
- Keep only $10 < \sigma < 200$
- Also cut out all listings that only have a few unique price values
- Keep only $\left\lvert \left\{ X \right\}\right\rvert > 10$
- Periodicity is the enemy of seasonal trends
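A sketch of that filtering (the slide code is skipped; the column names stddev and num_unique follow the output below):

# Hedged sketch: price variability among the indicator listings
indicator_prices = calendar_data[calendar_data.listing_id.isin(top_listings)]
price_stats = indicator_prices.groupby('listing_id')['price']\
    .agg(['std', 'nunique'])\
    .rename(columns={'std': 'stddev', 'nunique': 'num_unique'})
sensitive_listings = price_stats[
    (price_stats.stddev > 10) &
    (price_stats.stddev < 200) &
    (price_stats.num_unique > 10)
]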
sensitive_listings
| listing_id | stddev | num_unique |
|---|---|---|
| 5455004 | 139.340961 | 108 |
| 6119918 | 128.657169 | 216 |
| 8827268 | 155.416406 | 189 |
| 14421304 | 128.396181 | 96 |
| 14421403 | 127.944730 | 105 |
| 14421692 | 127.944730 | 105 |
Plotting our Indicator Listings¶
Combining Naive Occupancy and Indicator Listings¶
What does this tell us?¶
- Winter was the busy season for 2016-2017
- Most likely because of family/holidays
- Also the cheapest
- Summers are expensive
- Memorial Day Weekend is expensive (the spike in the middle)
- The start of MIT school year is expensive (spike at the right side)
- Visit Boston between New Year's and March for the cheapest rates.
- Weekends are more expensive than weekdays, but this doesn't influence occupancy.
- Our naive approach looks odd in Fall 2016 due to AirBnB's increased activity in the area
- According to numerous news sources, this was a year of protest over AirBnB in Boston, which is probably skewing the data
These are good preliminary results, but for more accurate results we'd want several years to reduce influence of increased activity, year specific events, legal actions, etc.
Neighborhood Specific Seasonal Trends¶
Let's dig into any seasonal trends we can find on a neighborhood basis.
full_combined_data = listing_data.merge(
calendar_data,
how='inner',
left_on='id',
right_on='listing_id'
)
Let's plot each neighborhood by their average price and fill-rate per day.
neighborhood_data = full_combined_data\
.groupby(['neighbourhood_cleansed', 'date'])\
.agg({'filled': 'sum', 'price_y': 'mean'})
neighborhood_data = neighborhood_data.unstack(level=0)
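The plot itself isn't shown; a hedged sketch, once neighborhood_data has been unstacked (one column per neighborhood for each metric):

import matplotlib.pyplot as plt

# Average nightly price per neighborhood over time; 'filled' can be plotted the same way
neighborhood_data['price_y'].plot(figsize=(14, 6), legend=False,
                                  title='Average nightly price by neighborhood')
plt.show()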
What does this tell us?¶
- As with before, Memorial Day Weekend stands out as a spike in pricing and a drop in occupancy
- Weekends are more expensive
- December and March 1st have a huge drop in occupancy and pricing
- Not every seasonal trend affects every neighborhood! Some are immune to the average trend (or even move opposite to it).
As with before, we'd ideally want more data to make more accurate observations.
Examining Neighborhoods¶
Let's also see if we can pull out neighborhood features.
Some listings don't have neighborhood descriptions, so let's skip those.
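The filtering and label extraction aren't shown in the slides; a sketch consistent with the names used below:

# Hedged sketch: keep only listings with a neighborhood description
valid_desc_data = listing_data[listing_data.neighborhood_overview.notnull()]
neighborhood_labels = valid_desc_data.neighbourhood_cleansed.unique()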
neighborhood_labels
array(['Roslindale', 'Jamaica Plain', 'Mission Hill', 'Longwood Medical Area', 'Bay Village', 'Leather District', 'Chinatown', 'North End', 'Roxbury', 'South End', 'Back Bay', 'East Boston', 'Charlestown', 'West End', 'Beacon Hill', 'Downtown', 'Fenway', 'Brighton', 'West Roxbury', 'Hyde Park', 'Mattapan', 'Dorchester', 'South Boston Waterfront', 'South Boston', 'Allston'], dtype=object)
How many listings per neighborhood?¶
valid_desc_data.groupby('neighbourhood_cleansed').agg('size').sort_values()
neighbourhood_cleansed
Leather District             5
Longwood Medical Area        6
Mattapan                    14
Hyde Park                   15
Bay Village                 19
West Roxbury                24
West End                    32
South Boston Waterfront     41
Roslindale                  42
Chinatown                   46
Charlestown                 53
Mission Hill                58
East Boston                 87
North End                   88
Roxbury                     92
Brighton                   105
Downtown                   108
South Boston               114
Beacon Hill                131
Dorchester                 144
Allston                    146
Fenway                     148
Back Bay                   181
South End                  225
Jamaica Plain              246
dtype: int64
Where are these neighborhoods?¶
Top 5 Neighborhoods¶
Let's only take the top 5 neighborhoods with the most listings.
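A sketch of how top5_neighborhoods could be derived (the slide code is skipped; this matches the ordering shown below):

# Hedged sketch: the five neighborhoods with the most listings
top5_neighborhoods = list(
    valid_desc_data.groupby('neighbourhood_cleansed')
    .agg('size')
    .sort_values()
    .tail(5)
    .index
)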
top5_neighborhoods
['Allston', 'Fenway', 'Back Bay', 'South End', 'Jamaica Plain']
Now let's make a word cloud for each neighborhood based on the most common words in their descriptions.
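The cloud generation isn't shown; a minimal sketch using the wordcloud package (an assumption about the library used):

from wordcloud import WordCloud
import matplotlib.pyplot as plt

for hood in top5_neighborhoods:
    # Concatenate every overview written for this neighborhood
    text = ' '.join(valid_desc_data.loc[
        valid_desc_data.neighbourhood_cleansed == hood,
        'neighborhood_overview'
    ])
    cloud = WordCloud(width=800, height=400).generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(hood)
    plt.show()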
Allston¶
Fenway¶
Back Bay¶
South End¶
Jamaica Plain¶
Feature Extraction¶
Wordclouds are pretty, but also fairly crude. Let's take a deeper dive into these top 5 neighborhoods.
top5_fulltext = top5_listings[['neighbourhood_cleansed',
'neighborhood_overview']]
top5_fulltext.head(3)
| | neighbourhood_cleansed | neighborhood_overview |
|---|---|---|
| 59 | Jamaica Plain | The neighborhood is complete with all shops, r... |
| 60 | Jamaica Plain | Downtown Jamaica Plain is a delight with plent... |
| 61 | Jamaica Plain | the neighborhood is exactly that ... a neighbo... |
Term Frequency - Inverse Document Frequency¶
From Wikipedia,
tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.
In essence, it is the product of "how frequently a term appears in a document" and "the inverse of how many documents in the corpus contain that term".
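In the most common formulation, for a term $t$, a document $d$, and a corpus $D$ of $N$ documents (scikit-learn applies its own smoothing on top of this):

$$\text{tf-idf}(t, d) = \text{tf}(t, d) \cdot \log \frac{N}{\left\lvert \left\{ d' \in D : t \in d' \right\} \right\rvert}$$

where $\text{tf}(t, d)$ is the number of times $t$ appears in $d$.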
Using this concept we can construct a document matrix, where each row represents a document in the corpus, and each column represents a word that appeared.
The big difference from our earlier wordcloud approach, which relies on raw frequency alone, is that tf-idf also accounts for how widespread each word is across the entire document set, down-weighting words that appear everywhere.
Scikit-Learn¶
sklearn provides several vectorizers, including a tf-idf vectorizer.
We give it a tokenizing regular expression in order to prune less relevant tokens (a token is just a unit of semantic meaning; here we only want words longer than 3 characters).
We're going to feed this all of the listing descriptions from our top 5 neighborhoods and aggregate later.
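The vectorizer setup (vect) and the cleaned text list (top5_cleantext) aren't shown in the slides; a minimal sketch, assuming a four-or-more-letter token pattern and English stop words (the stemmed terms in the later summary suggest the real notebook also stems with nltk, which is omitted here):

from sklearn.feature_extraction.text import TfidfVectorizer

# Hedged sketch: keep only alphabetic tokens longer than 3 characters
vect = TfidfVectorizer(
    token_pattern=r'(?u)\b[a-zA-Z]{4,}\b',
    stop_words='english'
)

# Assumed: one document per listing description in the top 5 neighborhoods
top5_cleantext = top5_fulltext.neighborhood_overview.tolist()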
fit = vect.fit(top5_cleantext)
X = vect.fit_transform(top5_cleantext)
The shape of this document matrix, $946 \times 1599$, indicates there are $946$ documents, and $1599$ tokens.
However, this matrix is incredibly sparse (only about 0.5% of entries are nonzero), since not every token appears in every document.
X.shape
(946, 1599)
X.astype(bool).sum() / (946 * 3019)
0.0049293165834142748
Using tf-idf¶
Now that we have our document matrix, let's use it to figure out the most important words per document.
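The construction of summary is skipped in the slides; one hedged way to pull the highest-scoring tokens per neighborhood out of the document matrix:

import numpy as np

# get_feature_names_out() in newer scikit-learn releases
feature_names = np.array(vect.get_feature_names())
neighborhoods = top5_fulltext.neighbourhood_cleansed.values

lines = []
for hood in top5_neighborhoods:
    # Rows of X belonging to this neighborhood, averaged into one tf-idf profile
    idx = np.where(neighborhoods == hood)[0]
    mean_weights = np.asarray(X[idx].mean(axis=0)).ravel()
    top_tokens = feature_names[mean_weights.argsort()[::-1][:16]]
    lines.append(f'{hood}: ' + ', '.join(top_tokens))
summary = '\n'.join(lines)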
print(summary)
South End: distanc, yourself, brownston, locat, young, across, years, wrought, would, worlds, world, wonder, discov, within, dinner, beauti
Fenway: dynam, young, museum, attract, years, would, worth, worri, wonder, anyth, women, without, within, multicultur, moist, modern
Back Bay: hospit, years, convention, wrong, convent, worth, appreci, homes, block, histori, wonder, histor, within, conveni, almost, window
Jamaica Plain: zagat, yummi, lucki, youth, yourself, younger, distanc, young, along, longer, locations, burger, locat, would, worth, worst
Allston: minut, youth, decid, younger, blocks, culture, young, block, anywher, activ, cultur, biking, between, midst, world, midnight
What does this tell us?¶
- Stemming (converting words to their "base" form) is tricky and inaccurate
- Tf-idf emphasizes words that appear in fewer documents
- This gives us a better summary than just seeing "Boston" for every neighborhood
- The advantage over raw word frequencies is that we surface important terms that aren't mentioned frequently.
- South End:
- Located in a good spot, younger crowd, good restaurants, "deeper beauty".
- Fenway
- Younger crowd, has museums, multicultural, modern.
- Back Bay:
- Hospital access, conventions here, high value, historical districts
- Jamaica Plain:
- Lots of Zagat-reviewed restaurants, good food here, younger crowd.
- Allston:
- Younger crowd, access to outdoors activities (biking, etc.), active nightlife.
Conclusions¶
Seasonal Trends¶
- Winter was the busy season for 2016-2017
- Most likely because of family/holidays
- Also the cheapest
- Summers are expensive
- Memorial Day Weekend is expensive (the spike in the middle)
- The start of MIT school year is expensive (spike at the right side)
- Visit Boston between New Year's and March for the cheapest rates.
- Weekends are more expensive than weekdays, but this doesn't influence occupancy.
- Our naive approach looks odd in Fall 2016 due to AirBnB's increased activity in the area
- According to numerous news sources, this was a year of protest over AirBnB in Boston, which is probably skewing the data
- As with before, Memorial Day Weekend stands out as a spike in pricing and a drop in occupancy
- Weekends are more expensive
- December and March 1st have a huge drop in occupancy and pricing
- Not every seasonal trend affects every neighborhood! Some are immune to the average trend (or even move opposite to it).
Neighborhoods¶
The Leather District, West End, and Chinatown are the most expensive neighborhoods to stay in.
- South End:
- Located in a good spot, younger crowd, good restaurants, "deeper beauty".
- Fenway
- Younger crowd, has museums, multicultural, modern.
- Back Bay:
- Hospital access, conventions here, high value, historical districts
- Jamaica Plain:
- Lots of Zagat-reviewed restaurants, good food here, younger crowd.
- Allston:
- Younger crowd, access to outdoors activities (biking, etc.), active nightlife.