如何(智能)循环遍历GeoDataframe中的所有点并查看最近的邻居
•浏览 1
How to (smartly) loop over all points in a GeoDataframe and look at nearest neighbours
我有一个大 (O(10^6) 行) 数据集(带有值的点),我需要对所有点执行以下操作:
- 在预定义的半径内找到最近的 3 个点。
- 计算这三个点的关联值的平均值。
- 将平均值保存到我正在查看的点
"非矢量化"方法是简单地遍历所有点...对于所有点,然后应用逻辑。然而,这扩展性很差。
我已经包含了一个玩具示例,它可以满足我的需求。我已经考虑过的想法是:
- 使用 shapely.ops.nearest_points: 然而,这似乎只返回一个最近的点。
- 围绕每个单独的点进行缓冲并与原始 GeoDataframe 进行连接:这似乎比天真的方法更糟糕。
这是我要实现的逻辑的玩具示例:
import pandas as pd
import numpy as np
from shapely.wkt import loads
import geopandas as gp
points=[
'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',
'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',
'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'
]
values=[9,8,7,6,5,4,3,2,1]
df=pd.DataFrame({'points':points,'values':values})
gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})
for index,row in gdf.iterrows(): # Looping over all points
gdf['dist'] = np.nan
for index2,row2 in gdf.iterrows(): # Looping over all the other points
if index==index2: continue
d=row['geometry'].distance(row2['geometry']) # Calculate distance
if d<3: gdf.at[index2,'dist']=d # If within cutoff: Store
else: gdf.at[index2,'dist']=np.nan # Otherwise, be paranoid and leave NAN
# Calculating mean of values for the 3 nearest points and storing
gdf.at[index,'mean']=np.mean(gdf.sort_values('dist').head(3)['values'].tolist())
print(gdf)
points values geometry dist mean
0 POINT (1 1.1) 9 POINT (1 1.1) 2.758623 6.333333
1 POINT (1 1.9) 8 POINT (1 1.9) 2.282542 7.000000
2 POINT (1 3.1) 7 POINT (1 3.1) 2.002498 5.666667
3 POINT (2 1) 6 POINT (2 1) 2.236068 5.666667
4 POINT (2 2.1) 5 POINT (2 2.1) 1.345362 4.666667
5 POINT (2 2.9) 4 POINT (2 2.9) 1.004988 4.333333
6 POINT (3 0.8) 3 POINT (3 0.8) 2.200000 4.333333
7 POINT (3 2) 2 POINT (3 2) 1.000000 3.000000
8 POINT (3 3) 1 POINT (3 3) NaN 3.666667
import pandas as pd
import numpy as np
from shapely.wkt import loads
import geopandas as gp
import libpysal
points=[
'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',
'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',
'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'
]
values=[9,8,7,6,5,4,3,2,1]
df=pd.DataFrame({'points':points,'values':values})
gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})
knn3 = libpysal.weights.KNN.from_dataframe(gdf, k=3)
means = []
for index, row in gdf.iterrows(): # Looping over all points
knn_neighbors = knn3.neighbors[index]
knnsubset = gdf.iloc[knn_neighbors]
neighbors = []
for ix, r in knnsubset.iterrows():
if r.geometry.distance(row.geometry) < 3: # max distance here
neighbors.append(ix)
subset = gdf.iloc[list(neighbors)]
means.append(np.mean(subset['values']))
gdf['mean'] = means points values geometry mean
0 POINT (1 1.1) 9 POINT (1 1.1) 6.333333
1 POINT (1 1.9) 8 POINT (1 1.9) 7.000000
2 POINT (1 3.1) 7 POINT (1 3.1) 5.666667
3 POINT (2 1) 6 POINT (2 1) 5.666667
4 POINT (2 2.1) 5 POINT (2 2.1) 4.666667
5 POINT (2 2.9) 4 POINT (2 2.9) 4.333333
6 POINT (3 0.8) 3 POINT (3 0.8) 4.333333
7 POINT (3 2) 2 POINT (3 2) 3.000000
8 POINT (3 3) 1 POINT (3 3) 3.666667
生成的 GeoDataframe 在这里:
import pandas as pd
import numpy as np
from shapely.wkt import loads
import geopandas as gp
points=[
'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',
'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',
'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'
]
values=[9,8,7,6,5,4,3,2,1]
df=pd.DataFrame({'points':points,'values':values})
gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})
for index,row in gdf.iterrows(): # Looping over all points
gdf['dist'] = np.nan
for index2,row2 in gdf.iterrows(): # Looping over all the other points
if index==index2: continue
d=row['geometry'].distance(row2['geometry']) # Calculate distance
if d<3: gdf.at[index2,'dist']=d # If within cutoff: Store
else: gdf.at[index2,'dist']=np.nan # Otherwise, be paranoid and leave NAN
# Calculating mean of values for the 3 nearest points and storing
gdf.at[index,'mean']=np.mean(gdf.sort_values('dist').head(3)['values'].tolist())
print(gdf)
points values geometry dist mean
0 POINT (1 1.1) 9 POINT (1 1.1) 2.758623 6.333333
1 POINT (1 1.9) 8 POINT (1 1.9) 2.282542 7.000000
2 POINT (1 3.1) 7 POINT (1 3.1) 2.002498 5.666667
3 POINT (2 1) 6 POINT (2 1) 2.236068 5.666667
4 POINT (2 2.1) 5 POINT (2 2.1) 1.345362 4.666667
5 POINT (2 2.9) 4 POINT (2 2.9) 1.004988 4.333333
6 POINT (3 0.8) 3 POINT (3 0.8) 2.200000 4.333333
7 POINT (3 2) 2 POINT (3 2) 1.000000 3.000000
8 POINT (3 3) 1 POINT (3 3) NaN 3.666667
import pandas as pd
import numpy as np
from shapely.wkt import loads
import geopandas as gp
import libpysal
points=[
'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',
'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',
'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'
]
values=[9,8,7,6,5,4,3,2,1]
df=pd.DataFrame({'points':points,'values':values})
gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})
knn3 = libpysal.weights.KNN.from_dataframe(gdf, k=3)
means = []
for index, row in gdf.iterrows(): # Looping over all points
knn_neighbors = knn3.neighbors[index]
knnsubset = gdf.iloc[knn_neighbors]
neighbors = []
for ix, r in knnsubset.iterrows():
if r.geometry.distance(row.geometry) < 3: # max distance here
neighbors.append(ix)
subset = gdf.iloc[list(neighbors)]
means.append(np.mean(subset['values']))
gdf['mean'] = means points values geometry mean
0 POINT (1 1.1) 9 POINT (1 1.1) 6.333333
1 POINT (1 1.9) 8 POINT (1 1.9) 7.000000
2 POINT (1 3.1) 7 POINT (1 3.1) 5.666667
3 POINT (2 1) 6 POINT (2 1) 5.666667
4 POINT (2 2.1) 5 POINT (2 2.1) 4.666667
5 POINT (2 2.9) 4 POINT (2 2.9) 4.333333
6 POINT (3 0.8) 3 POINT (3 0.8) 4.333333
7 POINT (3 2) 2 POINT (3 2) 3.000000
8 POINT (3 3) 1 POINT (3 3) 3.666667
你可以看到最后一次迭代的状态。
- 除了在 NAN 留下的最终位置之外,所有距离都已计算。
- 最后一次迭代的平均值是三个最近点的平均值:2、4和5,即3,666667。
如何以更具可扩展性的方式做到这一点?
我会为此使用空间索引。您可以使用 libpysal 的功能,它在底层使用 KDTree。对于 2000 个随机点,以下代码运行时间为 3.5 秒,而您的代码运行时间很长(第一分钟后我失去了耐心)。将值保存到列表,然后将列表转换为 DF 的列也可以节省一些时间。
import pandas as pd
import numpy as np
from shapely.wkt import loads
import geopandas as gp
points=[
'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',
'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',
'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'
]
values=[9,8,7,6,5,4,3,2,1]
df=pd.DataFrame({'points':points,'values':values})
gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})
for index,row in gdf.iterrows(): # Looping over all points
gdf['dist'] = np.nan
for index2,row2 in gdf.iterrows(): # Looping over all the other points
if index==index2: continue
d=row['geometry'].distance(row2['geometry']) # Calculate distance
if d<3: gdf.at[index2,'dist']=d # If within cutoff: Store
else: gdf.at[index2,'dist']=np.nan # Otherwise, be paranoid and leave NAN
# Calculating mean of values for the 3 nearest points and storing
gdf.at[index,'mean']=np.mean(gdf.sort_values('dist').head(3)['values'].tolist())
print(gdf)
points values geometry dist mean
0 POINT (1 1.1) 9 POINT (1 1.1) 2.758623 6.333333
1 POINT (1 1.9) 8 POINT (1 1.9) 2.282542 7.000000
2 POINT (1 3.1) 7 POINT (1 3.1) 2.002498 5.666667
3 POINT (2 1) 6 POINT (2 1) 2.236068 5.666667
4 POINT (2 2.1) 5 POINT (2 2.1) 1.345362 4.666667
5 POINT (2 2.9) 4 POINT (2 2.9) 1.004988 4.333333
6 POINT (3 0.8) 3 POINT (3 0.8) 2.200000 4.333333
7 POINT (3 2) 2 POINT (3 2) 1.000000 3.000000
8 POINT (3 3) 1 POINT (3 3) NaN 3.666667
import pandas as pd
import numpy as np
from shapely.wkt import loads
import geopandas as gp
import libpysal
points=[
'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',
'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',
'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'
]
values=[9,8,7,6,5,4,3,2,1]
df=pd.DataFrame({'points':points,'values':values})
gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})
knn3 = libpysal.weights.KNN.from_dataframe(gdf, k=3)
means = []
for index, row in gdf.iterrows(): # Looping over all points
knn_neighbors = knn3.neighbors[index]
knnsubset = gdf.iloc[knn_neighbors]
neighbors = []
for ix, r in knnsubset.iterrows():
if r.geometry.distance(row.geometry) < 3: # max distance here
neighbors.append(ix)
subset = gdf.iloc[list(neighbors)]
means.append(np.mean(subset['values']))
gdf['mean'] = means points values geometry mean
0 POINT (1 1.1) 9 POINT (1 1.1) 6.333333
1 POINT (1 1.9) 8 POINT (1 1.9) 7.000000
2 POINT (1 3.1) 7 POINT (1 3.1) 5.666667
3 POINT (2 1) 6 POINT (2 1) 5.666667
4 POINT (2 2.1) 5 POINT (2 2.1) 4.666667
5 POINT (2 2.9) 4 POINT (2 2.9) 4.333333
6 POINT (3 0.8) 3 POINT (3 0.8) 4.333333
7 POINT (3 2) 2 POINT (3 2) 3.000000
8 POINT (3 3) 1 POINT (3 3) 3.666667
这是结果:
import pandas as pd
import numpy as np
from shapely.wkt import loads
import geopandas as gp
points=[
'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',
'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',
'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'
]
values=[9,8,7,6,5,4,3,2,1]
df=pd.DataFrame({'points':points,'values':values})
gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})
for index,row in gdf.iterrows(): # Looping over all points
gdf['dist'] = np.nan
for index2,row2 in gdf.iterrows(): # Looping over all the other points
if index==index2: continue
d=row['geometry'].distance(row2['geometry']) # Calculate distance
if d<3: gdf.at[index2,'dist']=d # If within cutoff: Store
else: gdf.at[index2,'dist']=np.nan # Otherwise, be paranoid and leave NAN
# Calculating mean of values for the 3 nearest points and storing
gdf.at[index,'mean']=np.mean(gdf.sort_values('dist').head(3)['values'].tolist())
print(gdf)
points values geometry dist mean
0 POINT (1 1.1) 9 POINT (1 1.1) 2.758623 6.333333
1 POINT (1 1.9) 8 POINT (1 1.9) 2.282542 7.000000
2 POINT (1 3.1) 7 POINT (1 3.1) 2.002498 5.666667
3 POINT (2 1) 6 POINT (2 1) 2.236068 5.666667
4 POINT (2 2.1) 5 POINT (2 2.1) 1.345362 4.666667
5 POINT (2 2.9) 4 POINT (2 2.9) 1.004988 4.333333
6 POINT (3 0.8) 3 POINT (3 0.8) 2.200000 4.333333
7 POINT (3 2) 2 POINT (3 2) 1.000000 3.000000
8 POINT (3 3) 1 POINT (3 3) NaN 3.666667
import pandas as pd
import numpy as np
from shapely.wkt import loads
import geopandas as gp
import libpysal
points=[
'POINT (1 1.1)', 'POINT (1 1.9)', 'POINT (1 3.1)',
'POINT (2 1)', 'POINT (2 2.1)', 'POINT (2 2.9)',
'POINT (3 0.8)', 'POINT (3 2)', 'POINT (3 3)'
]
values=[9,8,7,6,5,4,3,2,1]
df=pd.DataFrame({'points':points,'values':values})
gdf=gp.GeoDataFrame(df,geometry=[loads(x) for x in df.points], crs={'init': 'epsg:' + str(25832)})
knn3 = libpysal.weights.KNN.from_dataframe(gdf, k=3)
means = []
for index, row in gdf.iterrows(): # Looping over all points
knn_neighbors = knn3.neighbors[index]
knnsubset = gdf.iloc[knn_neighbors]
neighbors = []
for ix, r in knnsubset.iterrows():
if r.geometry.distance(row.geometry) < 3: # max distance here
neighbors.append(ix)
subset = gdf.iloc[list(neighbors)]
means.append(np.mean(subset['values']))
gdf['mean'] = means points values geometry mean
0 POINT (1 1.1) 9 POINT (1 1.1) 6.333333
1 POINT (1 1.9) 8 POINT (1 1.9) 7.000000
2 POINT (1 3.1) 7 POINT (1 3.1) 5.666667
3 POINT (2 1) 6 POINT (2 1) 5.666667
4 POINT (2 2.1) 5 POINT (2 2.1) 4.666667
5 POINT (2 2.9) 4 POINT (2 2.9) 4.333333
6 POINT (3 0.8) 3 POINT (3 0.8) 4.333333
7 POINT (3 2) 2 POINT (3 2) 3.000000
8 POINT (3 3) 1 POINT (3 3) 3.666667